I have been working on an Actor-Critic network, but in the Lunar Lander gym environment I cannot get a reward better than about -175. In many of my trials, a recurring outcome is that the network decides not to fire any thrusters and simply falls. It comes down near the landing pad, but of course it crashes and collects the -100 reward. Is this because it is not exploring? I would also like to know whether my loss implementation is correct, and whether it lets the gradient flow through all of the loss terms (policy, value, and entropy). I have tried many variations of the learning rate and of the value and entropy coefficients; every time, the network concludes that falling is the best option and, once it does, it never stops doing so. The full code is below, and right after it I have put a stripped-down sketch of just the loss computation in case that is easier to check.
import numpy as np
import pandas as pd
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical  # needed for Categorical in sample_action
import copy
import matplotlib.pyplot as plt
import random
import time
class Discrete_AC(nn.Module):
    def __init__(self, o_space, a_space, hidden_layer_sizes, lr=0.005, val_coeff=0.5, ent_coeff=0.01):
        super(Discrete_AC, self).__init__()
        self.o_space = o_space
        self.a_space = a_space
        self.val_coeff = val_coeff
        self.ent_coeff = ent_coeff
        net_list = []
        last_layer = o_space
        for nodes in hidden_layer_sizes:
            net_list.append(nn.Linear(last_layer, nodes))
            net_list.append(nn.ReLU())
            last_layer = nodes
        self.net = nn.ModuleList(net_list)
        self.policy_head = nn.Linear(last_layer, a_space)
        self.value_head = nn.Linear(last_layer, 1)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)
        self.episode = []
    def forward(self, o_space):
        if type(o_space) == np.ndarray:
            x = torch.tensor(o_space.astype(np.float32))
        else:
            x = o_space
        for layer in self.net:
            x = layer(x)
        policy = self.policy_head(x)
        value = self.value_head(x)
        return F.softmax(policy, dim=-1), value

    def sample_action(self, o_space):
        policy, value = self.forward(o_space)
        policy = policy.view(-1)
        value = value.view(-1)
        dist = Categorical(policy)
        action = dist.sample()
        return action.item()
    def record(self, o_space, action, reward):
        self.episode.append((o_space, action, reward))

    def collect_returns(self, gamma):
        # walk the episode backwards, accumulating discounted returns
        G_i = 0
        new_episode = []
        for o, a, r in reversed(self.episode):
            new_episode.append((o, a, G_i))
            G_i = r + gamma * G_i
        self.episode = list(reversed(new_episode))
    def return_rs(self):
        rs = []
        for _, _, r in self.episode:
            rs.append(r)
        return rs

    def return_Gs(self):
        Gs = []
        for _, _, G in self.episode:
            Gs.append(G)
        return Gs

    def return_os(self):
        os = []
        for o, _, _ in self.episode:
            os.append(o)
        return os

    def return_actions(self):
        actions = []
        for _, a, _ in self.episode:
            actions.append(a)
        return actions
    def return_losses(self):
        Gs = self.return_Gs()
        Gs = np.array(Gs).astype(np.float32)
        actions = self.return_actions()
        actions = np.eye(self.a_space)[actions].astype(np.float32)  # one-hot encode the taken actions
        os = self.return_os()
        os = np.stack(os)
        policies, values = self.forward(os)
        entropy_loss = torch.mean(policies * torch.log(policies))  # mean p*log(p), i.e. negative entropy
        policies = torch.sum(torch.tensor(actions) * policies, dim=-1)  # probability of each taken action
        value_loss = F.mse_loss(values, torch.tensor(Gs))
        advantages = torch.tensor(Gs) - values.detach()
        policy_loss = -torch.mean(advantages * torch.log(policies))
        return policy_loss, value_loss, entropy_loss
    def play_episode(self, env, render=True):
        done = False
        o = env.reset()
        r = 0.
        self.episode = []
        while not done:
            if render:
                env.render()
            a = self.sample_action(o)
            o_prime, r, done, _ = env.step(a)
            self.record(o, a, r)
            o = o_prime
        agg_r = sum(self.return_rs())
        return agg_r
    def train(self, gamma=1., steps=1):
        self.collect_returns(gamma)
        pi_loss = 0
        v_loss = 0
        for step in range(steps):
            self.optimizer.zero_grad()
            policy_loss, value_loss, entropy_loss = self.return_losses()
            loss = policy_loss + self.val_coeff * value_loss + self.ent_coeff * entropy_loss
            loss.backward()
            self.optimizer.step()
            # running average of the two loss terms across the update steps
            pi_loss = (step / steps) * pi_loss + (1 - step / steps) * policy_loss.item()
            v_loss = (step / steps) * v_loss + (1 - step / steps) * value_loss.item()
        return pi_loss, v_loss

    def return_metrics(self, policy_loss, value_loss):
        metrics = {'pi_loss': policy_loss,
                   'v_loss': value_loss}
        return metrics
env = gym.make('LunarLander-v2')
agent = Discrete_AC(8, 4, [512])
EPISODES = 10000#0
render_int = 100
log_int = 50
rewards = []
pi_losses = []
v_losses = []
for episode in range(EPISODES):
    agg_r = agent.play_episode(env, episode % render_int == 0)
    rewards.append(agg_r)
    policy_loss, value_loss = agent.train()
    pi_losses.append(policy_loss)
    v_losses.append(value_loss)
    metrics = agent.return_metrics(policy_loss, value_loss)
    metrics['episode'] = episode
    metrics['episodes'] = EPISODES
    metrics['performance'] = np.mean(rewards[-100:])
    if episode % log_int == 0:
        string = '\rEpisode {episode}/{episodes} 100-episode Average Reward: {performance:0.4f} Policy Loss: {pi_loss:0.4f} Value Loss: {v_loss:0.4f}'.format(**metrics)
        print(string, end='')
env.close()
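To make the gradient-flow part of the question easier to check, here is the loss computation stripped down to a self-contained sketch with dummy tensors. It mirrors the structure of return_losses and train above rather than being the exact code (there the probabilities and values come out of the shared network), and 0.5 / 0.01 are just my default val_coeff / ent_coeff:

import torch
import torch.nn.functional as F

# Dummy stand-ins for the network outputs, just to illustrate the combined loss.
probs = torch.tensor([[0.7, 0.1, 0.1, 0.1],
                      [0.2, 0.2, 0.3, 0.3]], requires_grad=True)   # softmax output, (N, a_space)
values = torch.tensor([[0.5], [0.1]], requires_grad=True)          # value head output, (N, 1)
returns = torch.tensor([1.0, -1.0])                                # discounted returns G, (N,)
actions_onehot = torch.tensor([[1., 0., 0., 0.],
                               [0., 0., 1., 0.]])                  # one-hot taken actions

entropy_loss = torch.mean(probs * torch.log(probs))                # mean p*log(p), i.e. negative entropy
taken_probs = torch.sum(actions_onehot * probs, dim=-1)            # pi(a|s) for the taken actions
value_loss = F.mse_loss(values.view(-1), returns)                  # (my real code passes values as (N, 1) here)
advantages = returns - values.view(-1).detach()                    # critic detached in the actor term
policy_loss = -torch.mean(advantages * torch.log(taken_probs))

loss = policy_loss + 0.5 * value_loss + 0.01 * entropy_loss        # one scalar, one backward()
loss.backward()
print(probs.grad is not None, values.grad is not None)             # both heads receive gradients

My understanding is that because the three terms are summed into a single scalar before backward(), the shared layers should receive gradient from all of them, but please point out if that reasoning is wrong.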
Should I be taking a gradient step per sample, or is it fine to do it over the whole episode as one large batch? I have also tried normalizing the returns for each episode, the way the PyTorch example implementation does. That makes learning extremely slow, taking tens of thousands of episodes to make any visible difference in the reward, and so far I have never seen it converge.
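For reference, the per-episode normalization I mean is essentially the one from the PyTorch REINFORCE example: subtract the mean of the episode's returns and divide by their standard deviation. A minimal sketch of what I plugged in (the helper name and eps value are just mine):

import numpy as np

def normalize_returns(Gs, eps=1e-8):
    # Normalize an episode's discounted returns to zero mean and unit std.
    Gs = np.asarray(Gs, dtype=np.float32)
    return (Gs - Gs.mean()) / (Gs.std() + eps)

# used roughly as: Gs = normalize_returns(self.return_Gs()) before building the losses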