Q-learning with experience replay not learning

Date: 2018-12-04 14:05:56

Tags: python machine-learning reinforcement-learning q-learning

I'm trying to implement experience replay (ER) in the OpenAI Taxi-v2 environment. It should make convergence faster, but when I turn experience replay on, the agent does not seem to learn at all. According to the literature, ER should speed up convergence, so there must be a coding error somewhere in my implementation. I'm training a baseline agent (without ER) and an ER agent side by side to compare their performance, and, as expected, the baseline agent does appear to learn. I've spent a lot of time on this and I'm still stuck. I'd appreciate it if someone could look over the code, spot where I went wrong, and point me in the right direction.
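For context, agent.update(s, a, r, next_s) performs the usual tabular Q-learning update. The class comes from the course assignment and isn't shown here, so the snippet below is only my rough paraphrase of what it does (qvalues and q_learning_update are illustrative names, not the actual class):

from collections import defaultdict

# Hypothetical stand-in for the table inside QLearningAgent (not the actual class).
qvalues = defaultdict(float)  # maps (state, action) -> Q-value, defaults to 0.0

def q_learning_update(qvalues, s, a, r, next_s, alpha, discount, n_actions):
    # Q(s, a) <- (1 - alpha) * Q(s, a) + alpha * (r + discount * max_a' Q(s', a'))
    best_next = max(qvalues[(next_s, a2)] for a2 in range(n_actions))
    qvalues[(s, a)] = (1 - alpha) * qvalues[(s, a)] + alpha * (r + discount * best_next)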

I think the problem is in the function called play_and_train_with_replay. Here is the code:

import gym

env = gym.make("Taxi-v2")
n_actions = env.action_space.n
replay = ReplayBuffer(1000)
agent = QLearningAgent(alpha=0.5, epsilon=0.25, discount=0.99,
                       get_legal_actions=lambda s: range(n_actions))
# QLearningAgent is a class that implements q-learning.


def play_and_train_with_replay(env, agent, replay=None,
                               t_max=10**4, replay_batch_size=32):
    """
    This function should
    - run a full game, with actions given by agent.get_action(s)
    - train the agent using agent.update(...) whenever possible
    - return the total reward
    :param replay: ReplayBuffer where the agent can store and sample (s, a, r, s', done) tuples.
        If None, do not use experience replay.
    """
    total_reward = 0.0
    s = env.reset()

    for t in range(t_max):
        # get the agent to pick an action given state s
        a = agent.get_action(s)
        next_s, r, done, _ = env.step(a)
        agent.update(s, a, r, next_s)  # update the agent using q-learning

        if replay is not None:  # experience-replay branch
            # store the current <s, a, r, s'> transition in the buffer
            replay.add(s, a, r, next_s, done)

            # sample replay_batch_size random <s, a, r, s', done> transitions from the buffer
            slist, alist, rlist, next_slist, donelist = replay.sample(replay_batch_size)

            # then update the agent on each of them in a loop
            for indx in range(replay_batch_size):
                s = slist[indx]
                a = alist[indx]
                r = rlist[indx]
                next_s = next_slist[indx]
                done = donelist[indx]
                agent.update(s, a, r, next_s)

        s = next_s
        total_reward += r

        if done:
            break

    return total_reward
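For completeness, this is roughly how I run the two agents side by side (a sketch of my comparison loop; agent_baseline, agent_replay and the moving-average printout are illustrative, not the exact notebook code):

agent_baseline = QLearningAgent(alpha=0.5, epsilon=0.25, discount=0.99,
                                get_legal_actions=lambda s: range(n_actions))
agent_replay = QLearningAgent(alpha=0.5, epsilon=0.25, discount=0.99,
                              get_legal_actions=lambda s: range(n_actions))
replay = ReplayBuffer(1000)

rewards_baseline, rewards_replay = [], []
for episode in range(1000):
    # baseline agent: plain q-learning, no buffer
    rewards_baseline.append(play_and_train_with_replay(env, agent_baseline, replay=None))
    # second agent: q-learning plus experience replay
    rewards_replay.append(play_and_train_with_replay(env, agent_replay, replay=replay))

    if episode % 100 == 0:
        print("baseline:", np.mean(rewards_baseline[-100:]),
              "with ER:", np.mean(rewards_replay[-100:]))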

The total reward should increase with each iteration (and eventually become positive), but with experience replay turned on it plateaus somewhere between -40 and -15.

In case you want to see how I implemented the replay.add and replay.sample functions, the code is below. I'm not sure whether the bug is hiding somewhere in there, but I've been wrong before. The code below passes all of the preliminary tests.

import random
import numpy as np


class ReplayBuffer(object):
    def __init__(self, size):
        """
        Create a replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows, the oldest memories are dropped.
        """
        self._storage = []
        self._maxsize = size
        self._next_index = 0

    def __len__(self):
        return len(self._storage)

    def add(self, obs_t, action, reward, obs_tp1, done):
        """
        Make sure _storage does not exceed _maxsize.
        Make sure the FIFO rule is followed: the oldest examples are removed first.
        """
        data = (obs_t, action, reward, obs_tp1, done)

        if self._next_index >= len(self):
            self._storage.append(data)              # buffer not full yet: grow it
        else:
            self._storage[self._next_index] = data  # buffer full: overwrite the oldest entry
        self._next_index = (self._next_index + 1) % self._maxsize  # wrap the write index around the buffer


    def sample(self, batch_size):
        """Sample a batch of experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        """
        # randomly generate batch_size integers to use as sample indices (with replacement)
        _indxs = random.choices(range(len(self)), k=batch_size)

        obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = [], [], [], [], []

        # collect <s, a, r, s', done> for each sampled index
        for i in _indxs:
            obs_t, action, reward, obs_tp1, done = self._storage[i]
            obs_batch.append(np.array(obs_t, copy=False))
            act_batch.append(np.array(action, copy=False))
            rew_batch.append(np.array(reward, copy=False))
            next_obs_batch.append(np.array(obs_tp1, copy=False))
            done_mask.append(np.array(done, copy=False))

        return (np.array(obs_batch), np.array(act_batch), np.array(rew_batch),
                np.array(next_obs_batch), np.array(done_mask))
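A quick sanity check on the buffer might look like this (dummy transitions; only a sketch to exercise add/sample and the FIFO overwrite, not the assignment's official tests):

buf = ReplayBuffer(size=5)
for i in range(10):
    buf.add(obs_t=i, action=0, reward=-1.0, obs_tp1=i + 1, done=False)

assert len(buf) == 5              # never grows past maxsize
obs, act, rew, next_obs, done = buf.sample(32)
assert obs.shape == (32,)         # sampling is with replacement, so the batch can exceed the buffer size
assert all(o >= 5 for o in obs)   # the oldest transitions (0..4) were overwritten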

0 Answers