I'm trying to solve the standard cartpole example in OpenAI gym with a DQN that uses experience replay and epsilon decay, but I can't get it to converge; in fact, the loss grows exponentially.
I compared against this example, and there isn't much difference between my code and his: https://github.com/NoumaanKaleem/cartpole_ddqn/blob/master/deep-q-learning/dqn.py
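For reference, by "epsilon decay" I mean a schedule roughly like the one sketched below; in my agent the decay is currently commented out, so eps just stays fixed at 0.1 (the numbers here are only examples, not my actual settings):

eps = 1.0          # start acting mostly at random
eps_min = 0.05     # exploration floor
eps_decay = 0.995  # multiplicative decay applied once per episode

num_episodes = 1000
for episode in range(0, num_episodes):
    # ... run one episode, taking a random action with probability eps ...
    eps = max(eps_min, eps * eps_decay)  # decay after each episode, clamped at the floor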
Here is my code:
import tensorflow as tf
import numpy as np

# A single transition, stored via self.mem.add(SARSD(state, at, reward, next_state, done))
class SARSD:
    count = 0

    def __init__(self, state, at, reward, next_state, done):
        self.state = state
        self.at = at
        self.reward = reward
        self.next_state = next_state
        self.done = done
        self.ID = SARSD.count
        SARSD.count = SARSD.count + 1

class ReplayMemory:
    def __init__(self, N):
        self.mem = []
        self.N = N

    # add an element to the replay memory;
    # if the memory is already at capacity, drop the oldest item
    def add(self, elem):
        self.mem.append(elem)
        if len(self.mem) > self.N:
            self.mem.pop(0)

    def sample(self, num_elems):
        mem_length = len(self.mem)
        if num_elems > mem_length:
            return -1
        indices = np.random.randint(0, mem_length - 1, num_elems)
        new_array = []
        to_remove = []
        for i in indices:
            new_array.append(self.mem[i])
            to_remove.append(self.mem[i])
        # sampled transitions are removed from the memory
        self.mem = [value for value in self.mem if value not in to_remove]
        return new_array

    def full(self):
        return len(self.mem) == self.N

    def clear(self):
        self.mem.clear()

class DQNAgent:
    def __init__(self):
        self.mem = ReplayMemory(2048)
        # epsilon-greedy parameter
        self.eps = 0.1
        # discount factor
        self.gamma = 0.95
        # minibatch size
        self.minibatch_size = 32
        # epochs per fit call
        self.epochs = 1

    def learn(self, num_episodes, env, model):
        self.mem = ReplayMemory(1000)
        for episode in range(0, num_episodes):
            # if episode % int(num_episodes / 10) == 0:
            #     self.eps = self.eps - 0.2
            #     if self.eps < 0.05:
            #         self.eps = 0.05
            state = env.reset()
            state = np.reshape(state, [1, 4])
            done = False
            while done == False:
                if np.random.rand() < self.eps:
                    # sample a random action
                    at = env.action_space.sample()
                else:
                    # get the best action from the policy
                    at = np.argmax(model.predict(state))
                # execute action at
                next_state, reward, done, info = env.step(at)
                next_state = np.reshape(next_state, [1, 4])
                # store the transition in replay memory
                data = SARSD(state, at, reward, next_state, done)
                self.mem.add(data)
                # EDIT 1: surely this wasn't helping
                state = next_state
                if self.mem.full():
                    # sample a batch of transitions from replay memory
                    minibatch = self.mem.sample(self.minibatch_size)
                    # initialize the targets and data
                    y = []
                    x = []
                    for elem in minibatch:
                        # check if this is a terminal state
                        if elem.done == True:
                            yj = elem.reward
                        else:
                            # discounted reward formula
                            yj = elem.reward + self.gamma * np.max(model.predict(elem.next_state))
                        # get the prediction
                        prediction = model.predict(elem.state)
                        # replace the chosen action's value with the target
                        prediction[0][elem.at] = yj
                        y.append(prediction[0])
                        x.append(elem.state[0])
                    # fit the model on the targets y
                    x = np.array(x)
                    y = np.array(y)
                    model.fit(x, y, epochs=self.epochs)
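Just to spell out what I intend the minibatch update above to compute: for a terminal transition the target is simply yj = reward, and for a non-terminal one it is yj = reward + gamma * max_a Q(next_state, a). For example, with gamma = 0.95, a reward of 1 and a predicted maximum next-state Q-value of 10, the value written into the chosen action's slot would be 1 + 0.95 * 10 = 10.5.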
Here is the main python file where I declare the keras model to train:
import numpy as np
import gym
import DQNAgent
import tensorflow as tf

tf.logging.set_verbosity(tf.logging.ERROR)

env = gym.make('CartPole-v0')
agent = DQNAgent.DQNAgent()

model = tf.keras.Sequential()
model.add(tf.keras.layers.Dense(24, activation=tf.keras.activations.sigmoid, input_dim=4))
model.add(tf.keras.layers.Dense(24, activation=tf.keras.activations.sigmoid))
model.add(tf.keras.layers.Dense(2, activation=tf.keras.activations.linear))
model.compile(optimizer=tf.train.AdamOptimizer(0.01),
              loss=tf.keras.losses.mean_squared_error)

agent.learn(1000, env, model)

# evaluate whether the model has been trained correctly
for episode in range(0, 1000):
    next_state = env.reset()
    done = False
    while done == False:
        env.render()
        next_state = np.reshape(next_state, [1, 4])
        at = np.argmax(model.predict(next_state))
        next_state, reward, done, info = env.step(at)
As you can probably guess, I'm pretty new to Tensorflow/Keras.
EDIT 1: I forgot to actually assign state = next_state. That helps the loss stabilize, but it still doesn't solve cartpole...
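In case it matters, this is roughly how I'd check whether the problem is actually solved, assuming CartPole-v0's usual criterion of an average reward of at least 195 over 100 consecutive episodes (episode_rewards is a hypothetical list I would fill with the summed reward of each evaluation episode above):

episode_rewards = []  # hypothetical: total reward collected per evaluation episode
# ... append the summed reward of each evaluation episode here ...
if len(episode_rewards) >= 100:
    recent_avg = sum(episode_rewards[-100:]) / 100.0
    print("average over last 100 episodes:", recent_avg, "solved:", recent_avg >= 195.0)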