I'm trying to add experience replay (ER) to a Q-learning agent in the OpenAI Taxi-v2 environment. ER is supposed to speed up convergence, but when I turn it on the agent doesn't seem to learn at all. Since the literature says ER should make convergence faster, I assume there must be a coding error somewhere in my implementation. I'm running a baseline agent (no ER) and an ER agent side by side to compare performance, and, as expected, the baseline agent does appear to learn. I've spent quite a bit of time on this and I'm still stuck, so I'd appreciate it if someone could look over the code, spot where I went wrong, and point me in the right direction.
I believe the problem is in the function called play_and_train_with_replay. Here is the code:
import gym

env = gym.make("Taxi-v2")
n_actions = env.action_space.n
replay = ReplayBuffer(1000)
# QLearningAgent is a class that implements Q-learning
agent = QLearningAgent(alpha=0.5, epsilon=0.25, discount=0.99,
                       get_legal_actions=lambda s: range(n_actions))
def play_and_train_with_replay(env, agent, replay=None,
                               t_max=10**4, replay_batch_size=32):
    """
    This function should
    - run a full game, with actions given by agent.get_action(s)
    - train the agent using agent.update(...) whenever possible
    - return the total reward
    :param replay: ReplayBuffer where the agent can store and sample (s,a,r,s',done) tuples.
        If None, do not use experience replay.
    """
    total_reward = 0.0
    s = env.reset()
    for t in range(t_max):
        # get the agent to pick an action given state s
        a = agent.get_action(s)
        next_s, r, done, _ = env.step(a)
        agent.update(s, a, r, next_s)  # update the agent using Q-learning
        if replay is not None:  # experience-replay branch
            # store the current <s,a,r,s',done> transition in the buffer
            replay.add(s, a, r, next_s, done)
            # sample replay_batch_size random <s,a,r,s',done> transitions from the buffer,
            slist, alist, rlist, next_slist, donelist = replay.sample(replay_batch_size)
            # then update the agent on each of them in a loop
            for indx in range(replay_batch_size):
                s = slist[indx]
                a = alist[indx]
                r = rlist[indx]
                next_s = next_slist[indx]
                done = donelist[indx]
                agent.update(s, a, r, next_s)
        s = next_s
        total_reward += r
        if done:
            break
    return total_reward
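For completeness, agent.update(s, a, r, next_s) performs the usual tabular Q-learning update. The class itself isn't shown here, so the snippet below is just my sketch of what it does (q_values and q_update are illustrative names, not the actual implementation):

from collections import defaultdict

# Illustrative only -- roughly what agent.update(s, a, r, next_s) does,
# with the alpha/discount values the agent was constructed with.
q_values = defaultdict(float)   # maps (state, action) -> Q-value

def q_update(s, a, r, next_s, alpha=0.5, discount=0.99):
    best_next = max(q_values[(next_s, a2)] for a2 in range(n_actions))
    q_values[(s, a)] = (1 - alpha) * q_values[(s, a)] + \
        alpha * (r + discount * best_next)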
The total reward should increase over the episodes (and eventually become positive), but with experience replay turned on it plateaus somewhere between -40 and -15.
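For context, this is roughly how I run the two agents side by side and track their rewards (a simplified version of my actual loop; agent_baseline and agent_replay are just illustrative names, and I've left out epsilon decay and plotting):

import numpy as np

agent_baseline = QLearningAgent(alpha=0.5, epsilon=0.25, discount=0.99,
                                get_legal_actions=lambda s: range(n_actions))
agent_replay = QLearningAgent(alpha=0.5, epsilon=0.25, discount=0.99,
                              get_legal_actions=lambda s: range(n_actions))
replay = ReplayBuffer(1000)

rewards_baseline, rewards_replay = [], []
for i in range(1000):
    rewards_baseline.append(play_and_train_with_replay(env, agent_baseline, replay=None))
    rewards_replay.append(play_and_train_with_replay(env, agent_replay, replay=replay))
    if i % 100 == 0:
        print("baseline:", np.mean(rewards_baseline[-100:]),
              "with replay:", np.mean(rewards_replay[-100:]))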
In case the error is hiding in how I implemented replay.add and replay.sample, that code is below. I don't think the bug is there, but I've been wrong before; this code passes all of the preliminary tests.
import random
import numpy as np


class ReplayBuffer(object):
    def __init__(self, size):
        """
        Create a replay buffer.

        Parameters
        ----------
        size: int
            Max number of transitions to store in the buffer. When the buffer
            overflows, the oldest memories are dropped.
        """
        self._storage = []
        self._maxsize = size
        self._next_index = 0

    def __len__(self):
        return len(self._storage)
    def add(self, obs_t, action, reward, obs_tp1, done):
        """
        Make sure _storage never exceeds _maxsize and that the FIFO rule is
        followed: the oldest transitions are overwritten first.
        """
        data = (obs_t, action, reward, obs_tp1, done)
        if self._next_index >= self.__len__():
            # buffer not full yet: just append
            self._storage.append(data)
        else:
            # buffer full: overwrite the oldest transition
            self._storage[self._next_index] = data
        # wrap the write index around once it reaches _maxsize (FIFO behaviour)
        self._next_index = (self._next_index + 1) % self._maxsize
    def sample(self, batch_size):
        """Sample a batch of experiences.

        Parameters
        ----------
        batch_size: int
            How many transitions to sample.

        Returns
        -------
        obs_batch: np.array
            batch of observations
        act_batch: np.array
            batch of actions executed given obs_batch
        rew_batch: np.array
            rewards received as results of executing act_batch
        next_obs_batch: np.array
            next set of observations seen after executing act_batch
        done_mask: np.array
            done_mask[i] = 1 if executing act_batch[i] resulted in
            the end of an episode and 0 otherwise.
        """
        # randomly pick batch_size indices (sampling with replacement)
        _indxs = random.choices(range(self.__len__()), k=batch_size)
        obs_batch, act_batch, rew_batch, next_obs_batch, done_mask = [], [], [], [], []
        # collect <s,a,r,s',done> for each sampled index
        for i in _indxs:
            obs_t, action, reward, obs_tp1, done = self._storage[i]
            obs_batch.append(np.array(obs_t, copy=False))
            act_batch.append(np.array(action, copy=False))
            rew_batch.append(np.array(reward, copy=False))
            next_obs_batch.append(np.array(obs_tp1, copy=False))
            done_mask.append(np.array(done, copy=False))
        return (np.array(obs_batch), np.array(act_batch), np.array(rew_batch),
                np.array(next_obs_batch), np.array(done_mask))
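For what it's worth, a quick standalone check along these lines behaves the way I expect (this is just a sketch I put together for this post, not the actual preliminary tests):

buf = ReplayBuffer(3)
for i in range(5):
    buf.add(i, i, 1.0, i + 1, False)

print(len(buf))        # 3 -- capacity is respected
print(buf._storage)    # the oldest transitions (i=0 and i=1) have been overwritten
s, a, r, next_s, done = buf.sample(4)
print(s.shape)         # (4,) -- sampling is with replacement, so batch_size > len(buf) is fine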