Action values for all actions become NaN while training A3C

Time: 2020-09-04 08:53:18

Tags: python pytorch artificial-intelligence reinforcement-learning

I am building an A3C with 4 processes using PyTorch, as shown in the code below.

To my surprise, the action values for all actions become NaN while training the A3C. Initially the action values are not NaN, but after a full night of training they turn into NaN. Can someone help me figure out what the problem is?

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import traceback
from torch.autograd import Variable


class SharedAdam(torch.optim.Adam):
    """Adam whose moment estimates live in shared memory, so all A3C worker
    processes update the same optimizer state."""

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.99), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)
        # State initialization
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)
    
                # share in memory
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()
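
Not part of the question, but as a quick sanity check one can confirm that the moment buffers really end up in shared memory. The `dummy` model below is a stand-in used only for illustration:

import torch
import torch.nn as nn

dummy = nn.Linear(4, 2)                      # stand-in for the real model
opt = SharedAdam(dummy.parameters(), lr=1e-3)
for group in opt.param_groups:
    for p in group['params']:
        state = opt.state[p]
        # Both moment buffers were created in __init__ and moved to shared memory.
        assert state['exp_avg'].is_shared()
        assert state['exp_avg_sq'].is_shared()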



class ActorCritic(torch.nn.Module):

    def __init__(self, num_inputs, action_space):
        super(ActorCritic, self).__init__()

        self.num_inputs = num_inputs
        self.action_space = action_space
        self.lstm = nn.LSTMCell(num_inputs, num_inputs)
        num_outputs = action_space
        self.fc1 = nn.Linear(num_inputs, 256)
        self.fc1.apply(init_weights)  # init_weights: user-defined initializer, not shown in the question
        self.fc2 = nn.Linear(256, 256)
        self.fc2.apply(init_weights)
        self.critic_linear = nn.Linear(256, 1)
        self.critic_linear.apply(init_weights)
        self.actor_linear = nn.Linear(256, num_outputs)
        self.actor_linear.apply(init_weights)
        self.lstm.bias_ih.data.fill_(0)
        self.lstm.bias_hh.data.fill_(0)
        self.sig1 = nn.Sigmoid()
        self.train()

    def forward(self, inputs):
        inputs, (hx, cx) = inputs
        hx, cx = self.lstm(inputs, (hx, cx))
        x = self.sig1(self.fc1(hx))
        x = torch.tanh(self.fc2(x))
        return self.critic_linear(x), self.actor_linear(x), (hx, cx)
    
    def save(self, filename, directory):
        torch.save(self.state_dict(), '%s/%s_actor.pth' % (directory, filename))

    def load(self, filename, directory):
        self.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
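
Again not from the question: a small sketch that exercises ActorCritic once outside the training loop, using made-up sizes and a placeholder init_weights (the real one is not shown above), to confirm the outputs come back with the expected shapes and finite values:

import torch
import torch.nn as nn

def init_weights(m):
    # Placeholder for the asker's init_weights (definition not shown):
    # Xavier-initialize Linear layers and zero their biases.
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        nn.init.zeros_(m.bias)

state_dim, action_dim = 10, 3                 # assumed sizes for illustration
net = ActorCritic(state_dim, action_dim)
state = torch.randn(1, state_dim)
hx, cx = torch.zeros(1, state_dim), torch.zeros(1, state_dim)
value, action_values, (hx, cx) = net((state, (hx, cx)))
print(value.shape, action_values.shape)            # torch.Size([1, 1]) torch.Size([1, 3])
print(torch.isfinite(action_values).all().item())  # True for a freshly initialized net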

And below is the training code:

def train(rank,params, model, optimizer,data):
    try:
        data = data.dropna()

        count = 0

        data = torch.DoubleTensor(np.asarray(data))

        env = ENV(params.state_dim, params.action_dim, data)  # ENV: the asker's custom environment class (not shown)
        print("env created\n")
        # init training variables
        max_timesteps = data.shape[0] - 1
        state = env.reset()
        done = True
        episode_length = 0
        count = 0
        while count<max_timesteps-1:
            episode_length += 1
            if done:
                cx = Variable(torch.zeros(1, params.state_dim))
                hx = Variable(torch.zeros(1, params.state_dim))
            else:
                cx = Variable(cx.data)
                hx = Variable(hx.data)

            values = []
            log_probs = []
            rewards = []
            entropies = []
            while count<max_timesteps-1:
                value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
                prob = F.softmax(action_values,dim = -1)
                log_prob = F.log_softmax(action_values, dim=-1).reshape(-1,)
                entropy = -(log_prob * prob).sum(1, keepdim=True)
                entropies.append(entropy)
                
                action = sample(prob)
                
                
                log_prob = log_prob.gather(0, Variable(action))
         
                state, reward, done = env.step(action)
                done = (done or count == max_timesteps-2)
                reward = max(min(reward, 1), -1)
                
                count +=1
                
                if done:
                    episode_length = 0
                    state = env.reset()
                    
                
                values.append(value)
                log_probs.append(log_prob)
                rewards.append(reward)
                print(ticker, "rank ",rank," action:",action, "reward ",reward)  # ticker is presumably defined globally by the asker (not shown)

                if done:
                    break
                
            R = torch.zeros(1, 1)
            if not done:
                value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
                R = value.data
            values.append(Variable(R))
            policy_loss = 0
            value_loss = 0
            R = Variable(R)
            gae = torch.zeros(1, 1)
            for i in reversed(range(len(rewards))):
                R = params.gamma * R + rewards[i]
                advantage = R - values[i]
                value_loss = value_loss + 0.5 * advantage.pow(2)
                TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data
                gae = gae * params.gamma * params.tau + TD
                policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i]

            optimizer.zero_grad()
            (policy_loss + 0.5 * value_loss).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 40)
            optimizer.step()
            
    except:
        traceback.print_exc()
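
The question says the A3C runs with 4 processes, but the launcher itself is not included. A minimal sketch of how train() above is typically started with torch.multiprocessing, assuming params and data are prepared the same way the asker prepares them:

import torch.multiprocessing as mp

if __name__ == '__main__':
    # params and data are assumed to come from the asker's setup (not shown).
    global_model = ActorCritic(params.state_dim, params.action_dim)
    global_model.share_memory()                  # make the weights visible to every worker
    optimizer = SharedAdam(global_model.parameters(), lr=1e-4)

    processes = []
    for rank in range(4):                        # 4 worker processes, as in the question
        p = mp.Process(target=train,
                       args=(rank, params, global_model, optimizer, data))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()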

Below is the code used to sample an action:

def sample(logits):
    # Gumbel-max sampling: add Gumbel(0, 1) noise and take the argmax.
    # Note that train() passes softmax probabilities in here, whereas the
    # Gumbel-max trick expects (log-)logits.
    noise = torch.rand(logits.shape)
    return torch.argmax(logits - torch.log(-torch.log(noise)), 1)
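
Finally, since the whole question is about finding where the NaNs first appear, a debugging aid (not part of the original code) that is often useful: enable autograd anomaly detection once at startup and assert that the network outputs are finite right after every forward pass, e.g. with a small helper like the hypothetical assert_finite below:

import torch

# Enable once at startup: every backward pass then reports the op that
# produced a NaN/Inf gradient (slow, so for debugging only).
torch.autograd.set_detect_anomaly(True)

def assert_finite(name, tensor, step):
    # Fail fast so the very first non-finite value can be inspected.
    if not torch.isfinite(tensor).all():
        raise RuntimeError("%s became non-finite at step %d: %s" % (name, step, tensor))

# Hypothetical placement inside train(), right after the forward pass:
#   assert_finite("action_values", action_values, count)
#   assert_finite("value", value, count)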

0 Answers

There are no answers yet.