I'm trying to implement a DQN algorithm that trains an agent to play Breakout from the OpenAI Gym Atari environments, using the game's RAM state at each time step as the input. I started from the code in jaara's AI-blog repository (https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py#L102) and made a few changes to it. Here is the code:
import random, numpy, math, gym
from SumTree import SumTree
import tensorflow as tf
import numpy as np
from tensorflow.keras import backend as K
import scipy.misc
# -----------------HYPER PARAMETERS--------------
# IMAGE_WIDTH = 84
# IMAGE_HEIGHT = 84
RAM_SIZE = 128
IMAGE_STACK = 2
HUBER_LOSS_DELTA = 2.0
LEARNING_RATE = 0.00025
MEMORY_CAPACITY = 200000
BATCH_SIZE = 32
GAMMA = 0.99
MAX_EPSILON = 1
MIN_EPSILON = 0.1
EXPLORATION_STOP = 500000 # at this step the decaying part of epsilon is down to 1% of (MAX_EPSILON - MIN_EPSILON)
LAMBDA = - math.log(0.01) / EXPLORATION_STOP # speed of decay
UPDATE_TARGET_FREQUENCY = 10000
#-------------------- UTILITIES -----------------------
def huber_loss(y_true, y_pred):
err = y_true - y_pred
cond = K.abs(err) < HUBER_LOSS_DELTA
L2 = 0.5 * K.square(err)
L1 = HUBER_LOSS_DELTA * (K.abs(err) - 0.5 * HUBER_LOSS_DELTA)
loss = tf.where(cond, L2, L1) # Keras does not cover where function in tensorflow :-(
return K.mean(loss)
# def processImage( ram ):
# rgb = scipy.misc.imresize(ram, (IMAGE_WIDTH, IMAGE_HEIGHT), interp='bilinear')
#
# r, g, b = rgb[:,:,0], rgb[:,:,1], rgb[:,:,2]
# gray = 0.2989 * r + 0.5870 * g + 0.1140 * b # extract luminance
#
# o = gray.astype('float32') / 128 - 1 # normalize
# return o
def save_model(agent, problem, algorithm_name=None):
file_name = ("saved_models\\"
+ problem +
"-" + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M"))
if algorithm_name:
file_name += "-" + algorithm_name + ".h5"
else:
file_name += ".h5"
agent.brain.model.save(file_name)
#-------------------- BRAIN ---------------------------
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
class Brain:
def __init__(self, stateCnt, actionCnt, load_file=None):
self.stateCnt = stateCnt
self.actionCnt = actionCnt
self.history = None
self.model = self._createModel()
self.model_ = self._createModel() # target network
if load_file:
self.model.load_weights(load_file)
            self.model_.load_weights(load_file)  # load the saved weights into the target network as well
def _createModel(self):
model = Sequential()
model.add(Dense(units=128, activation="relu", input_dim=self.stateCnt))
        model.add(Dense(units=self.actionCnt, activation='linear'))  # output layer: one linear unit per action
opt = RMSprop(lr=LEARNING_RATE)
model.compile(loss=huber_loss, optimizer=opt)
return model
def train(self, x, y, epochs=1, verbose=0):
self.history = self.model.fit(x, y, batch_size=32, epochs=epochs, verbose=verbose)
# print(history.history["val_loss"])
def predict(self, s, target=False):
if target:
return self.model_.predict(s)
else:
return self.model.predict(s)
def predictOne(self, s, target=False):
return self.predict(s.reshape(1, IMAGE_STACK*RAM_SIZE), target).flatten()
def updateTargetModel(self):
self.model_.set_weights(self.model.get_weights())
#-------------------- MEMORY --------------------------
class Memory: # stored as ( s, a, r, s_ ) in SumTree
e = 0.01 # epsilon, prevent error from falling below 0
a = 0.6 # alpha, the degree of bias, with 0 meaning no bias at all
def __init__(self, capacity):
self.tree = SumTree(capacity)
def _getPriority(self, error):
return (error + self.e) ** self.a
def add(self, error, sample):
p = self._getPriority(error)
self.tree.add(p, sample)
def sample(self, n):
batch = []
segment = self.tree.total() / n
for i in range(n):
a = segment * i
b = segment * (i + 1)
s = random.uniform(a, b)
(idx, p, data) = self.tree.get(s)
batch.append((idx, data))
return batch
def update(self, idx, error):
"""
Update the priority value of given entry
:param idx: The index of the given entry
:param error: The error value to be updated.
:return: None
"""
p = self._getPriority(error)
self.tree.update(idx, p)
#-------------------- AGENT ---------------------------
class Agent:
steps = 0
epsilon = MAX_EPSILON
def __init__(self, stateCnt, actionCnt, file=None):
"""
Initialize an agent, specifying the shape of the states and number of actions
        :param int stateCnt: the size of the state vector, i.e. RAM_SIZE * IMAGE_STACK
            (the RAM size times the number of stacked frames)
:param actionCnt: The number of actions this agent can do
:param file: The model (e.g: .h5) file that's being loaded into the agents' brain.
"""
self.stateCnt = stateCnt
self.actionCnt = actionCnt
self.brain = Brain(stateCnt, actionCnt, file)
self.memory = Memory(MEMORY_CAPACITY)
def act(self, s):
"""
Do an action according to the current state
:param numpyArray s: the current state.
:return: int: the action that's being done
"""
if random.random() < self.epsilon:
return random.randint(0, self.actionCnt-1)
else:
return numpy.argmax(self.brain.predictOne(s))
def observe(self, sample): # in (s, a, r, s_) format
"""
Add a sample to its memory
        :param tuple sample: the (s, a, r, s_) sample to be added. s and s_ are arrays of size IMAGE_STACK*RAM_SIZE
:return: None
"""
x, y, errors = self._getTargets([(0, sample)])
        self.memory.add(errors[0], sample)  # errors has length 1 here; store its scalar value as the priority
if self.steps % UPDATE_TARGET_FREQUENCY == 0:
self.brain.updateTargetModel()
        # slowly decrease epsilon based on our experience
self.steps += 1
self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)
def _getTargets(self, batch):
"""
        Get the training inputs, targets and TD errors for a given batch
        :param list batch: the given [(idx, (s, a, r, s_))] samples
        :return: tuple (numpy.ndarray, numpy.ndarray, numpy.ndarray): x, y, errors
            x: the states s of the batch
            y: the target Q-value vectors, where the chosen action's entry is r + gamma * Q_target(s_, argmax_a Q(s_, a))
            errors: the absolute differences between the old and the updated Q(s, a)
"""
no_state = numpy.zeros(self.stateCnt)
states = numpy.array([ sample[1][0] for sample in batch ])
states_ = numpy.array([ (no_state if sample[1][3] is None else sample[1][3]) for sample in batch ])
        p = self.brain.predict(states)                 # online-network Q values for the current states
        p_ = self.brain.predict(states_, target=False) # online-network Q values for the next states (action selection)
        pTarget_ = self.brain.predict(states_, target=True)  # target-network Q values for the next states (evaluation)
x = numpy.zeros((len(batch), IMAGE_STACK*RAM_SIZE))
y = numpy.zeros((len(batch), self.actionCnt))
errors = numpy.zeros(len(batch))
for i in range(len(batch)):
sample = batch[i][1] # the i is the index, 1 is the actual sample
s = sample[0]; a = sample[1]; r = sample[2]; s_ = sample[3]
target = p[i] # target Q value for the i-th state
oldVal = target[a]
if s_ is None:
target[a] = r
else:
target[a] = r + GAMMA * pTarget_[i][ numpy.argmax(p_[i]) ] # double DQN
x[i] = s
y[i] = target
errors[i] = abs(oldVal - target[a])
return (x, y, errors)
def replay(self):
"""
Take a batch from the agent's memory, get the x and y data and train it in the brain.
Also update the error values (priorities) of the entries in the batch.
:return: None
"""
batch = self.memory.sample(BATCH_SIZE)
x, y, errors = self._getTargets(batch)
# update errors
for i in range(len(batch)):
idx = batch[i][0]
self.memory.update(idx, errors[i])
self.brain.train(x, y)
class RandomAgent:
memory = Memory(MEMORY_CAPACITY)
exp = 0
def __init__(self, actionCnt):
self.actionCnt = actionCnt
def act(self, s):
return random.randint(0, self.actionCnt-1)
def observe(self, sample):
"""
Add a sample to its memory
:param 4-tuple sample: the (s, a, r, s_) sample to be added
:return: None
"""
# in (s, a, r, s_) format
error = abs(sample[2]) # reward
self.memory.add(error, sample)
self.exp += 1
def replay(self):
pass
#-------------------- ENVIRONMENT ---------------------
class Environment:
def __init__(self, problem):
self.problem = problem
self.env = gym.make(problem)
self.frames = 0
self.episodes = 0
self.R_40epi = 0
def run(self, agent):
ram = self.env.reset()
# w = processImage(ram)
        s = numpy.concatenate((numpy.zeros(RAM_SIZE*(IMAGE_STACK-1)), ram))  # pad with zeros so the newest RAM snapshot sits at the end
R = 0
last_action = 0
while True:
self.env.render()
self.frames += 1
# Frame skipping
# if self.frames % IMAGE_STACK == 0:
a = agent.act(s)
# last_action = a
# else:
# a = last_action
r = 0
ram, r, done, info = self.env.step(a)
            s_ = numpy.concatenate((s[RAM_SIZE:RAM_SIZE*IMAGE_STACK], ram)) # drop the oldest RAM snapshot and append the newest
r = np.clip(r, -1, 1) # clip reward to [-1, 1]
if done: # terminal state
s_ = None
agent.observe( (s, a, r, s_) )
agent.replay()
s = s_
R += r
if done:
self.R_40epi += R
break
info = ("Total reward: " + str(R) + " " +
"Episode:" + str(self.episodes) + " " +
"Frames:" + str(self.frames) + " " +
datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
if not type(agent) is RandomAgent and agent.brain.history is not None:
info = (info + " loss: " + str(agent.brain.history.history["loss"]))
print(info)
if self.episodes % 40 == 0:
print("average in last 40 episodes:", self.R_40epi/40)
self.R_40epi = 0
self.episodes += 1
        # save roughly once an hour (whenever an episode finishes while the clock reads minute 00)
if datetime.datetime.now().strftime("%M") == "00" and type(agent) is not RandomAgent:
save_model(agent, self.problem, "ddqn-ram")
#-------------------- MAIN ----------------------------
import datetime
import sys
PROBLEM = 'Breakout-ram-v0'
env = Environment(PROBLEM)
# file = "saved_models\Breakout-ram-v0-2018-08-17-16-46-ddqn-ram.h5"
stateCnt = IMAGE_STACK*RAM_SIZE
actionCnt = env.env.action_space.n
agent = Agent(stateCnt, actionCnt)
randomAgent = RandomAgent(actionCnt)
try:
print("Initialization with random agent...")
while randomAgent.exp < MEMORY_CAPACITY:
env.run(randomAgent)
print(randomAgent.exp, "/", MEMORY_CAPACITY)
agent.memory = randomAgent.memory
randomAgent = None
print("Starting learning")
env.frames = 0
env.episodes = 0
# S = env.env.step(env.env.action_space.sample)[0]
while True:
env.run(agent)
finally:
save_model(agent, PROBLEM, "ddqn-ram-single128")
The problem I'm running into is that when I train an agent with this code, the average reward per episode increases at first, but once it reaches about 3 to 4 (which happens after roughly one million time steps) it starts to drop, settles at around 1, and never increases again no matter how long I keep training (most published algorithms reach scores of 60 to 100). The differences between the original code and my modified version are that I use the game's RAM state as the input instead of the screen images, I use only a single dense hidden layer of 128 units, and I'm playing Breakout instead of the Seaquest that the original code plays. The code also uses double DQN, reward clipping, and prioritized experience replay. What could be causing this? Could reading the RAM instead of the game frames be the problem?
For reference, here is the implementation of the SumTree data structure I use:
import numpy
class SumTree:
def __init__(self, capacity):
"""
Initialize a sum tree structure
:param capacity: the number of values the tree can store
"""
self.capacity = capacity
self.tree = numpy.zeros( 2*capacity - 1 ) # the numpy array representing the actual tree
self.data = numpy.zeros( capacity, dtype=object ) # the array representing the data (leaf) of the tree
self.write = 0
def _propagate(self, idx, change):
parent = (idx - 1) // 2
self.tree[parent] += change
if parent != 0:
self._propagate(parent, change)
def _retrieve(self, idx, s):
left = 2 * idx + 1
right = left + 1
if left >= len(self.tree):
return idx
if s <= self.tree[left]:
return self._retrieve(left, s)
else:
return self._retrieve(right, s-self.tree[left])
def total(self):
return self.tree[0]
def add(self, p, data):
idx = self.write + self.capacity - 1
self.data[self.write] = data
self.update(idx, p)
self.write += 1
if self.write >= self.capacity:
self.write = 0
def update(self, idx, p):
change = p - self.tree[idx]
self.tree[idx] = p
self._propagate(idx, change)
def get(self, s):
idx = self._retrieve(0, s)
dataIdx = idx - self.capacity + 1
return (idx, self.tree[idx], self.data[dataIdx])
Answer 0 (score: 1):
I think there are two main reasons:
The algorithm uses prioritized replay. It gives transitions with a high temporal-difference error a higher probability of being sampled, because a high TD error means the network cannot yet predict the correct Q-values for those states, so sampling them more often trains the model to handle them better. The problem is that these states are only a subset of the whole state space, so the model becomes biased towards that subset and performs worse on the rest of the state space. This becomes especially problematic the longer you train, because only a small fraction of states will still have very large errors. To avoid this, you can drop the prioritized replay. See the original paper here: https://arxiv.org/abs/1511.05952
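If you want to test this, one option is to replace the prioritized Memory with a plain uniform replay buffer. Below is a minimal sketch (the class name UniformMemory and its internals are my own, not part of the original repository) that keeps the same add/sample/update interface, so Agent and RandomAgent can use it unchanged; the TD error passed to add() is simply ignored:
import random
from collections import deque
class UniformMemory:  # hypothetical drop-in replacement for the prioritized Memory class
    def __init__(self, capacity):
        self.samples = deque(maxlen=capacity)  # the oldest transitions are discarded automatically
    def add(self, error, sample):
        self.samples.append(sample)  # the TD error is ignored: every transition is equally likely to be drawn
    def sample(self, n):
        n = min(n, len(self.samples))
        # keep the (idx, data) tuple layout that Agent.replay() expects
        return [(None, data) for data in random.sample(list(self.samples), n)]
    def update(self, idx, error):
        pass  # nothing to update without priorities
With this in place, Memory(MEMORY_CAPACITY) in Agent.__init__ and in RandomAgent would become UniformMemory(MEMORY_CAPACITY), and the priority updates in Agent.replay() turn into no-ops.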
You may also want to lower the learning rate, or increase the batch size as training progresses; according to a recent paper from Google, the two are apparently equivalent: https://openreview.net/forum?id=B1Yy1BxCZ. Letting the learning rate decay very slowly towards 0 effectively stops training after a while, which is what you want: if you never lower the learning rate, one unlucky batch of bad data can wreck the neural network's weights.
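As a rough illustration of the learning-rate part of that suggestion, here is a sketch (assuming TF 2.x; the decay numbers are placeholders, not tuned values) that swaps the fixed RMSprop rate in Brain._createModel for an exponentially decaying schedule:
import tensorflow as tf
def make_decaying_optimizer():
    # the effective learning rate shrinks geometrically over training instead of staying fixed
    lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=0.00025,  # same starting point as LEARNING_RATE above
        decay_steps=100000,             # apply one decay step every 100k optimizer updates
        decay_rate=0.96)                # multiply the learning rate by 0.96 at each decay step
    return tf.keras.optimizers.RMSprop(learning_rate=lr_schedule)
# in Brain._createModel you would then compile with this optimizer instead:
# model.compile(loss=huber_loss, optimizer=make_decaying_optimizer())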