DQN average reward decreases after a period of training

Asked: 2018-08-22 05:11:59

Tags: python machine-learning neural-network deep-learning reinforcement-learning

I am trying to implement a DQN algorithm that trains an agent to play Breakout from the OpenAI Gym Atari environment, with the game's RAM state at each time step given as input. I used the code from jaara's AI-blog repository (https://github.com/jaara/AI-blog/blob/master/Seaquest-DDQN-PER.py#L102) and made some changes to it. Here is the code:

import random, numpy, math, gym
from SumTree import SumTree
import tensorflow as tf
import numpy as np
from tensorflow.keras import backend as K
import scipy.misc

# -----------------HYPER PARAMETERS--------------
# IMAGE_WIDTH = 84
# IMAGE_HEIGHT = 84
RAM_SIZE = 128
IMAGE_STACK = 2

HUBER_LOSS_DELTA = 2.0
LEARNING_RATE = 0.00025

MEMORY_CAPACITY = 200000

BATCH_SIZE = 32

GAMMA = 0.99

MAX_EPSILON = 1
MIN_EPSILON = 0.1

EXPLORATION_STOP = 500000   # at this step the decay term exp(-LAMBDA * steps) reaches 0.01
LAMBDA = - math.log(0.01) / EXPLORATION_STOP  # speed of decay

UPDATE_TARGET_FREQUENCY = 10000

#-------------------- UTILITIES -----------------------
def huber_loss(y_true, y_pred):
    err = y_true - y_pred

    cond = K.abs(err) < HUBER_LOSS_DELTA
    L2 = 0.5 * K.square(err)
    L1 = HUBER_LOSS_DELTA * (K.abs(err) - 0.5 * HUBER_LOSS_DELTA)

    loss = tf.where(cond, L2, L1)   # Keras does not cover where function in tensorflow :-(

    return K.mean(loss)

# def processImage( ram ):
#     rgb = scipy.misc.imresize(ram, (IMAGE_WIDTH, IMAGE_HEIGHT), interp='bilinear')
#
#     r, g, b = rgb[:,:,0], rgb[:,:,1], rgb[:,:,2]
#     gray = 0.2989 * r + 0.5870 * g + 0.1140 * b     # extract luminance
#
#     o = gray.astype('float32') / 128 - 1    # normalize
#     return o

def save_model(agent, problem, algorithm_name=None):
    file_name = ("saved_models\\" + problem + "-"
                 + datetime.datetime.now().strftime("%Y-%m-%d-%H-%M"))
    if algorithm_name:
        file_name += "-" + algorithm_name + ".h5"
    else:
        file_name += ".h5"
    agent.brain.model.save(file_name)


#-------------------- BRAIN ---------------------------
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *

class Brain:
    def __init__(self, stateCnt, actionCnt, load_file=None):
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt
        self.history = None

        self.model = self._createModel()
        self.model_ = self._createModel()          # target network
        if load_file:
            self.model.load_weights(load_file)
            self.model_.load_weights(load_file)   # load the saved weights into the target network as well


    def _createModel(self):
        model = Sequential()

        model.add(Dense(units=128, activation="relu", input_dim=self.stateCnt))
        model.add(Dense(units=self.actionCnt, activation='linear'))  # input size is inferred from the previous layer

        opt = RMSprop(lr=LEARNING_RATE)
        model.compile(loss=huber_loss, optimizer=opt)

        return model

    def train(self, x, y, epochs=1, verbose=0):
        self.history = self.model.fit(x, y, batch_size=32, epochs=epochs, verbose=verbose)
        # print(history.history["val_loss"])

    def predict(self, s, target=False):
        if target:
            return self.model_.predict(s)
        else:
            return self.model.predict(s)

    def predictOne(self, s, target=False):
        return self.predict(s.reshape(1, IMAGE_STACK*RAM_SIZE), target).flatten()

    def updateTargetModel(self):
        self.model_.set_weights(self.model.get_weights())

#-------------------- MEMORY --------------------------
class Memory:   # stored as ( s, a, r, s_ ) in SumTree
    e = 0.01  # epsilon, prevent error from falling below 0
    a = 0.6  # alpha, the degree of bias, with 0 meaning no bias at all

    def __init__(self, capacity):
        self.tree = SumTree(capacity)

    def _getPriority(self, error):
        return (error + self.e) ** self.a

    def add(self, error, sample):
        p = self._getPriority(error)
        self.tree.add(p, sample)

    def sample(self, n):
        batch = []
        segment = self.tree.total() / n

        for i in range(n):
            a = segment * i
            b = segment * (i + 1)

            s = random.uniform(a, b)
            (idx, p, data) = self.tree.get(s)
            batch.append((idx, data))

        return batch


    def update(self, idx, error):
        """
        Update the priority value of given entry
        :param idx: The index of the given entry
        :param error: The error value to be updated.
        :return: None
        """
        p = self._getPriority(error)
        self.tree.update(idx, p)

#-------------------- AGENT ---------------------------


class Agent:
    steps = 0
    epsilon = MAX_EPSILON

    def __init__(self, stateCnt, actionCnt, file=None):
        """
        Initialize an agent, specifying the shape of the states and number of actions
        :param int stateCnt: the size of the state vector fed to the network,
                i.e. RAM_SIZE * IMAGE_STACK (RAM bytes times stacked frames)
        :param actionCnt: The number of actions this agent can do
        :param file: The model (e.g: .h5) file that's being loaded into the agents' brain.
        """
        self.stateCnt = stateCnt
        self.actionCnt = actionCnt

        self.brain = Brain(stateCnt, actionCnt, file)
        self.memory = Memory(MEMORY_CAPACITY)

    def act(self, s):
        """
        Do an action according to the current state
        :param numpyArray s: the current state.
        :return: int: the action that's being done
        """
        if random.random() < self.epsilon:
            return random.randint(0, self.actionCnt-1)
        else:
            return numpy.argmax(self.brain.predictOne(s))

    def observe(self, sample):  # in (s, a, r, s_) format
        """
        Add a sample to its memory
        :param tuple sample: the (s, a, r, s_) sample to be added; s and s_ are arrays of size IMAGE_STACK*RAM_SIZE
        :return: None
        """
        x, y, errors = self._getTargets([(0, sample)])
        self.memory.add(errors[0], sample)   # errors is a length-1 array here; store the scalar priority

        if self.steps % UPDATE_TARGET_FREQUENCY == 0:
            self.brain.updateTargetModel()

        # slowly decrease epsilon based on our experience
        self.steps += 1
        self.epsilon = MIN_EPSILON + (MAX_EPSILON - MIN_EPSILON) * math.exp(-LAMBDA * self.steps)

    def _getTargets(self, batch):
        """
        Get the states, target Q values and TD errors for a given batch.
        :param list batch: list of (idx, (s, a, r, s_)) samples
        :return: tuple (ndarray, ndarray, ndarray): three values: x, y, errors
                x: the batch of states s
                y: target Q values, where y[i][a] = r + GAMMA * Q_target(s_, argmax_a' Q(s_, a'))  (double DQN)
                errors: absolute TD errors |Q(s, a) - y[i][a]|, used as replay priorities.
        """
        no_state = numpy.zeros(self.stateCnt)

        states = numpy.array([ sample[1][0] for sample in batch ])
        states_ = numpy.array([ (no_state if sample[1][3] is None else sample[1][3]) for sample in batch ])

        p = self.brain.predict(states)                         # current-network Q values for each state s

        p_ = self.brain.predict(states_, target=False)         # current-network Q values for s_ (action selection)
        pTarget_ = self.brain.predict(states_, target=True)    # target-network Q values for s_ (action evaluation)

        x = numpy.zeros((len(batch), IMAGE_STACK*RAM_SIZE))
        y = numpy.zeros((len(batch), self.actionCnt))
        errors = numpy.zeros(len(batch))

        for i in range(len(batch)):
            sample = batch[i][1]   # the i is the index, 1 is the actual sample
            s = sample[0]; a = sample[1]; r = sample[2]; s_ = sample[3]

            target = p[i]     # target Q value for the i-th state
            oldVal = target[a]
            if s_ is None:
                target[a] = r
            else:
                target[a] = r + GAMMA * pTarget_[i][ numpy.argmax(p_[i]) ]  # double DQN

            x[i] = s
            y[i] = target
            errors[i] = abs(oldVal - target[a])

        return (x, y, errors)

    def replay(self):
        """
        Take a batch from the agent's memory, get the x and y data and train it in the brain.
        Also update the error values (priorities) of the entries in the batch.
        :return: None
        """
        batch = self.memory.sample(BATCH_SIZE)
        x, y, errors = self._getTargets(batch)

        # update errors
        for i in range(len(batch)):
            idx = batch[i][0]
            self.memory.update(idx, errors[i])

        self.brain.train(x, y)


class RandomAgent:
    memory = Memory(MEMORY_CAPACITY)
    exp = 0

    def __init__(self, actionCnt):
        self.actionCnt = actionCnt

    def act(self, s):
        return random.randint(0, self.actionCnt-1)

    def observe(self, sample):
        """
        Add a sample to its memory
        :param 4-tuple sample: the (s, a, r, s_) sample to be added
        :return: None
        """
        # in (s, a, r, s_) format
        error = abs(sample[2])  # reward
        self.memory.add(error, sample)
        self.exp += 1

    def replay(self):
        pass

#-------------------- ENVIRONMENT ---------------------
class Environment:
    def __init__(self, problem):
        self.problem = problem
        self.env = gym.make(problem)
        self.frames = 0
        self.episodes = 0
        self.R_40epi = 0

    def run(self, agent):
        ram = self.env.reset()
        # w = processImage(ram)
        s = numpy.concatenate((ram, numpy.zeros(128*(IMAGE_STACK-1))))

        R = 0
        last_action = 0
        while True:
            self.env.render()
            self.frames += 1

            # Frame skipping
            # if self.frames % IMAGE_STACK == 0:
            a = agent.act(s)
                # last_action = a
            # else:
            #     a = last_action

            ram, r, done, info = self.env.step(a)
            s_ = numpy.concatenate((s[128:128*IMAGE_STACK], ram))  # stack of the last IMAGE_STACK RAM states

            r = np.clip(r, -1, 1)   # clip reward to [-1, 1]

            if done:  # terminal state
                s_ = None

            agent.observe( (s, a, r, s_) )
            agent.replay()            

            s = s_
            R += r
            if done:
                self.R_40epi += R
                break

        info = ("Total reward: " + str(R) + " " +
              "Episode:" + str(self.episodes) + " " +
              "Frames:" + str(self.frames) + " " +
              datetime.datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))
        if not type(agent) is RandomAgent and agent.brain.history is not None:
            info = (info + " loss: " + str(agent.brain.history.history["loss"]))
        print(info)
        if self.episodes % 40 == 0:
            print("average in last 40 episodes:", self.R_40epi/40)
            self.R_40epi = 0
        self.episodes += 1
        # save when the clock minute rolls over to 00 (roughly once an hour)
        if datetime.datetime.now().strftime("%M") == "00" and type(agent) is not RandomAgent:
            save_model(agent, self.problem, "ddqn-ram")

#-------------------- MAIN ----------------------------
import datetime
import sys
PROBLEM = 'Breakout-ram-v0'
env = Environment(PROBLEM)
# file = "saved_models\Breakout-ram-v0-2018-08-17-16-46-ddqn-ram.h5"

stateCnt  = IMAGE_STACK*RAM_SIZE
actionCnt = env.env.action_space.n

agent = Agent(stateCnt, actionCnt)
randomAgent = RandomAgent(actionCnt)

try:
    print("Initialization with random agent...")
    while randomAgent.exp < MEMORY_CAPACITY:
        env.run(randomAgent)
        print(randomAgent.exp, "/", MEMORY_CAPACITY)

    agent.memory = randomAgent.memory

    randomAgent = None

    print("Starting learning")
    env.frames = 0
    env.episodes = 0

    # S = env.env.step(env.env.action_space.sample)[0]
    while True:
        env.run(agent)
finally:
    save_model(agent, PROBLEM, "ddqn-ram-single128")

The problem I'm running into is that when I train the agent with this code, the average reward per episode increases at first, but once it reaches about 3 to 4 (which happens after roughly one million time steps), it starts to decrease and settles at around 1, never rising again no matter how long I keep training (most algorithms reach rewards of 60 to 100). The differences between the original code and my modified version are that I use the game's RAM state as input instead of frames, I use only a single 128-node dense hidden layer, and I'm playing Breakout instead of Seaquest, which the original code plays. The code also uses double DQN, reward clipping and prioritized experience replay. What could be causing this? Could reading the RAM instead of the game frames be the problem?

For reference, here is the implementation of the SumTree data structure I'm using:

import numpy

class SumTree:

    def __init__(self, capacity):
        """
        Initialize a sum tree structure
        :param capacity: the number of values the tree can store
        """
        self.capacity = capacity
        self.tree = numpy.zeros( 2*capacity - 1 )  # the numpy array representing the actual tree
        self.data = numpy.zeros( capacity, dtype=object )  # the array representing the data (leaf) of the tree
        self.write = 0

    def _propagate(self, idx, change):
        parent = (idx - 1) // 2

        self.tree[parent] += change

        if parent != 0:
            self._propagate(parent, change)

    def _retrieve(self, idx, s):
        left = 2 * idx + 1
        right = left + 1

        if left >= len(self.tree):
            return idx

        if s <= self.tree[left]:
            return self._retrieve(left, s)
        else:
            return self._retrieve(right, s-self.tree[left])

    def total(self):
        return self.tree[0]

    def add(self, p, data):
        idx = self.write + self.capacity - 1

        self.data[self.write] = data
        self.update(idx, p)

        self.write += 1
        if self.write >= self.capacity:
            self.write = 0

    def update(self, idx, p):
        change = p - self.tree[idx]

        self.tree[idx] = p
        self._propagate(idx, change)

    def get(self, s):
        idx = self._retrieve(0, s)
        dataIdx = idx - self.capacity + 1

        return (idx, self.tree[idx], self.data[dataIdx])

1 Answer:

Answer 0 (score: 1):

I think there are two main reasons:

  1. The algorithm uses prioritized experience replay. It gives transitions with a larger temporal-difference error a higher probability of being sampled, because a large error means the network cannot yet predict the correct Q values for those states, so sampling them more often trains the model to handle those states better. The problem is that these states are only a subset of the whole state space, so the model becomes biased toward that subset and performs worse on the rest of the state space. This becomes especially problematic the longer you train, because eventually only a small fraction of states still has very large errors. To avoid this you can turn off prioritized replay (see the first sketch after this list). See the original paper here: https://arxiv.org/abs/1511.05952

  2. You may also want to decrease the learning rate, or equivalently increase the batch size, as training progresses; according to a recent paper from Google earlier this year, the two are apparently equivalent: https://openreview.net/forum?id=B1Yy1BxCZ Letting the learning rate decay very slowly towards 0 effectively stops training after a while, which helps because if you never decrease the learning rate, one unlucky batch of bad data can wreck the weights of your neural network. A sketch of a simple decay schedule is shown below, after the replay-memory sketch.
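Below is a minimal sketch (not the asker's code) of what "turning off" prioritized replay could look like: a plain ring-buffer memory that samples uniformly. The class name UniformMemory and its internals are illustrative assumptions; it only keeps the same add/sample/update interface that the Agent class in the question expects, so it could be swapped in for Memory without touching the rest of the code.

import random

class UniformMemory:
    """Ring-buffer replay memory with uniform sampling (no priorities)."""

    def __init__(self, capacity):
        self.capacity = capacity
        self.samples = []
        self.write = 0          # next slot to overwrite once the buffer is full

    def add(self, error, sample):
        # the error argument is ignored: every transition is treated equally
        if len(self.samples) < self.capacity:
            self.samples.append(sample)
        else:
            self.samples[self.write] = sample
        self.write = (self.write + 1) % self.capacity

    def sample(self, n):
        # return (idx, data) pairs so Agent.replay() keeps working unchanged
        idxs = random.sample(range(len(self.samples)), min(n, len(self.samples)))
        return [(i, self.samples[i]) for i in idxs]

    def update(self, idx, error):
        pass                    # nothing to update without priorities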
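And here is a minimal sketch of the learning-rate decay suggested in point 2. The constants INITIAL_LR and LR_DECAY and the helper decay_learning_rate are hypothetical choices, not part of the question's code; the sketch simply overwrites the compiled optimizer's learning rate through the Keras backend, which the question already imports as K.

from tensorflow.keras import backend as K

INITIAL_LR = 0.00025     # same value as LEARNING_RATE in the question
LR_DECAY = 0.999999      # hypothetical per-step exponential decay factor

def decay_learning_rate(model, step):
    """Set the optimizer's learning rate to INITIAL_LR * LR_DECAY ** step."""
    new_lr = INITIAL_LR * (LR_DECAY ** step)
    K.set_value(model.optimizer.lr, new_lr)

# e.g. called once per training step, inside Agent.replay():
#     decay_learning_rate(self.brain.model, self.steps)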