What is wrong with my Actor-Critic implementation?

Time: 2018-06-20 15:22:24

Tags: pytorch reinforcement-learning

I have been working on an Actor-Critic network. However, in the LunarLander gym environment I cannot get a reward higher than about -175. In many of the trials I have run, what keeps happening is that the network decides not to fire any thrusters and simply falls. It comes down near the landing pad, but of course it crashes, so it collects the -100 reward. Is this because it is not exploring? I would also like to know whether my loss implementation is correct and whether gradients actually flow through all the loss terms (policy, value and entropy). I have tried many variations of the learning rate and of the value and entropy coefficients. Every time, the network decides that falling is best, and once it does it never stops doing so. The full code is below:

import numpy as np
import pandas as pd
import gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical  # used in sample_action()
import copy
import matplotlib.pyplot as plt
import random
import time


class Discrete_AC(nn.Module):

    def __init__(self,o_space,a_space,hidden_layer_sizes,lr=0.005,val_coeff=0.5,ent_coeff=0.01):
        super(Discrete_AC,self).__init__()

        self.o_space = o_space
        self.a_space = a_space
        self.val_coeff = val_coeff
        self.ent_coeff = ent_coeff

        net_list = []
        last_layer = o_space
        for nodes in hidden_layer_sizes:
            net_list.append(nn.Linear(last_layer,nodes))
            net_list.append(nn.ReLU())
            last_layer = nodes
        self.net = nn.ModuleList(net_list)
        self.policy_head = nn.Linear(last_layer,a_space)
        self.value_head = nn.Linear(last_layer,1)

        self.optimizer = optim.Adam(self.parameters(), lr=lr)

        self.episode = []

    def forward(self,o_space):
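        # Accept either a raw NumPy observation or an existing tensor.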
        if type(o_space) == np.ndarray:
            x = torch.tensor(o_space.astype(np.float32))
        else:
            x = o_space

        for layer in self.net:
            x = layer(x)
        policy = self.policy_head(x)
        value = self.value_head(x)
        return F.softmax(policy,dim=-1),value

    def sample_action(self,o_space):
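        # Sample an action from the categorical distribution given by the policy head.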
        policy,value = self.forward(o_space)
        policy = policy.view(-1)
        value = value.view(-1)
        dist = Categorical(policy)
        action = dist.sample()
        return action.item()

    def record(self,o_space,action,reward):
        self.episode.append((o_space,action,reward))

    def collect_returns(self,gamma):
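        # Walk the stored episode in reverse, accumulating a discounted return G_i
        # and storing it in place of the raw reward for each (observation, action) pair.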
        G_i = 0
        new_episode = []
        for o,a,r in reversed(self.episode):
            new_episode.append((o,a,G_i))
            G_i = r + gamma*G_i
        self.episode = list(reversed(new_episode))

    def return_rs(self):
        rs = []
        for _,_,r in self.episode:
            rs.append(r)
        return rs

    def return_Gs(self):
        Gs = []
        for _,_,G in self.episode:
            Gs.append(G)
        return Gs

    def return_os(self):
        os = []
        for o,_,_ in self.episode:
            os.append(o)
        return os

    def return_actions(self):
        actions = []
        for _,a,_ in self.episode:
            actions.append(a)
        return actions

    def return_losses(self):
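        # Build the policy, value and entropy loss terms from the stored episode.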
        Gs = self.return_Gs()
        Gs = np.array(Gs).astype(np.float32)

        actions = self.return_actions()
        actions = np.eye(self.a_space)[actions].astype(np.float32)

        os = self.return_os()
        os = np.stack(os)

        policies, values = self.forward(os)
        # Mean of p*log(p) over the batch is the negative entropy of the policy.
        entropy_loss = torch.mean(policies*torch.log(policies))

        # Probability of the action that was actually taken at each step.
        policies = torch.sum(torch.tensor(actions)*policies,dim=-1)
        # Regress the value head against the returns; weight the log-probabilities
        # by the detached advantage for the policy loss.
        value_loss = F.mse_loss(values,torch.tensor(Gs))
        advantages = torch.tensor(Gs)-values.detach()
        policy_loss = -torch.mean(advantages*torch.log(policies))

        return policy_loss,value_loss,entropy_loss

    def play_episode(self,env,render=True):
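        # Roll out one full episode, recording (observation, action, reward) tuples,
        # and return the total undiscounted reward.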
        done = False
        o = env.reset()
        r = 0.
        self.episode = []
        while not done:
            if render:
                env.render()
            a = self.sample_action(o)
            o_prime,r,done,_ = env.step(a)
            self.record(o,a,r)
            o = o_prime

        agg_r = sum(self.return_rs())
        return agg_r

    def train(self,gamma=1.,steps=1):
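        # Convert the stored rewards into returns, then take `steps` gradient steps
        # on the combined policy + value + entropy loss.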
        self.collect_returns(gamma)
        pi_loss = 0
        v_loss = 0
        for step in range(steps):
            self.optimizer.zero_grad()
            policy_loss,value_loss,entropy_loss = self.return_losses()
            loss = policy_loss + self.val_coeff*value_loss + self.ent_coeff*entropy_loss
            loss.backward()
            self.optimizer.step()
            pi_loss = (step/steps)*pi_loss+(1-step/steps)*policy_loss.item()
            v_loss = (step/steps)*v_loss+(1-step/steps)*value_loss.item()
        return pi_loss,v_loss

    def return_metrics(self,policy_loss,value_loss):
        metrics = {'pi_loss': policy_loss,
                   'v_loss': value_loss}
        return metrics

env = gym.make('LunarLander-v2')
agent = Discrete_AC(8,4,[512])

EPISODES = 10000#0
render_int = 100
log_int = 50
rewards = []
pi_losses = []
v_losses = []

for episode in range(EPISODES):
    agg_r = agent.play_episode(env,episode % render_int == 0)
    rewards.append(agg_r)
    policy_loss, value_loss = agent.train()
    pi_losses.append(policy_loss)
    v_losses.append(value_loss)
    metrics = agent.return_metrics(policy_loss, value_loss)
    metrics['episode'] = episode
    metrics['episodes'] = EPISODES
    metrics['performance'] = np.mean(rewards[-100:])
    if episode % log_int == 0:
        string = ('\rEpisode {episode}/{episodes} 100-episode Average Reward: {performance:0.4f} '
                  'Policy Loss: {pi_loss:0.4f} Value Loss: {v_loss:0.4f}').format(**metrics)
        print (string,end='')
    env.close()

Should I be taking gradient steps one sample at a time, or is it fine to take them over a whole batch like this? I have also tried normalizing the returns per episode, as in the PyTorch examples. That made learning extremely slow, taking tens of thousands of episodes to show any noticeable progress on the reward, and so far I have never seen it converge.
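For reference, by normalizing the returns I mean something along these lines (a minimal sketch modelled on the return standardization in the PyTorch REINFORCE example; normalize_returns is only an illustrative helper, not part of the code above):

import torch

def normalize_returns(returns, eps=1e-8):
    # Standardize the per-episode returns to zero mean and unit variance
    # before they are used as value targets / advantages.
    G = torch.tensor(returns, dtype=torch.float32)
    return (G - G.mean()) / (G.std() + eps)

# e.g. inside return_losses():
# Gs = normalize_returns(self.return_Gs())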

0 Answers:

There are no answers yet.