I am building an A3C with 4 worker processes in PyTorch, as shown in the code below. To my surprise, the action values all become NaN during training: at the start they are fine, but after running overnight they turn into NaN. Could someone help me understand what is going wrong?
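For reference, the first thing I plan to try is a finiteness check right after the forward pass in the rollout loop, so I can see which output turns NaN first (assert_finite below is just an illustrative helper, not something that is already in my script):

def assert_finite(name, tensor):
    # torch.isfinite is False for both NaN and +/-inf
    if not torch.isfinite(tensor).all():
        raise RuntimeError("%s contains NaN/inf: %s" % (name, tensor))

# intended use, right after the model call in the rollout loop:
# value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
# assert_finite("action_values", action_values)
# assert_finite("value", value)

My code is below.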
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import traceback
from torch.autograd import Variable


# Adam optimizer whose state tensors live in shared memory, so all A3C
# worker processes update the same moment estimates.
class SharedAdam(torch.optim.Adam):
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.99), eps=1e-8,
                 weight_decay=0):
        super(SharedAdam, self).__init__(params, lr=lr, betas=betas, eps=eps,
                                         weight_decay=weight_decay)
        # State initialization
        for group in self.param_groups:
            for p in group['params']:
                state = self.state[p]
                state['step'] = 0
                state['exp_avg'] = torch.zeros_like(p.data)
                state['exp_avg_sq'] = torch.zeros_like(p.data)
                # share the moment estimates in memory across processes
                state['exp_avg'].share_memory_()
                state['exp_avg_sq'].share_memory_()
class ActorCritic(torch.nn.Module):
    def __init__(self, num_inputs, action_space):
        super(ActorCritic, self).__init__()
        self.num_inputs = num_inputs
        self.action_space = action_space
        # LSTM cell over the raw state, followed by two fully connected layers
        self.lstm = nn.LSTMCell(num_inputs, num_inputs)
        num_outputs = action_space
        self.fc1 = nn.Linear(num_inputs, 256)
        self.fc1.apply(init_weights)  # init_weights: my custom initializer (not shown)
        self.fc2 = nn.Linear(256, 256)
        self.fc2.apply(init_weights)
        self.critic_linear = nn.Linear(256, 1)
        self.critic_linear.apply(init_weights)
        self.actor_linear = nn.Linear(256, num_outputs)
        self.actor_linear.apply(init_weights)
        self.lstm.bias_ih.data.fill_(0)
        self.lstm.bias_hh.data.fill_(0)
        self.sig1 = nn.Sigmoid()
        self.train()

    def forward(self, inputs):
        inputs, (hx, cx) = inputs
        hx, cx = self.lstm(inputs, (hx, cx))
        x = self.sig1(self.fc1(hx))
        x = torch.tanh(self.fc2(x))
        # returns (state value, action logits, recurrent state)
        return self.critic_linear(x), self.actor_linear(x), (hx, cx)

    def save(self, filename, directory):
        torch.save(self.state_dict(), '%s/%s_actor.pth' % (directory, filename))

    def load(self, filename, directory):
        self.load_state_dict(torch.load('%s/%s_actor.pth' % (directory, filename)))
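For context, I launch the 4 workers roughly like this (a simplified sketch; the construction of params and data is omitted, and the arguments simply mirror what train expects, so this is not my exact launcher):

import torch.multiprocessing as mp

if __name__ == '__main__':
    model = ActorCritic(params.state_dim, params.action_dim)
    model.share_memory()              # model parameters live in shared memory
    optimizer = SharedAdam(model.parameters())

    processes = []
    for rank in range(4):             # 4 A3C worker processes
        p = mp.Process(target=train, args=(rank, params, model, optimizer, data))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()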
And below is the training code:
def train(rank, params, model, optimizer, data):
    try:
        data = data.dropna()
        count = 0
        data = torch.DoubleTensor(np.asarray(data))
        env = ENV(params.state_dim, params.action_dim, data)
        print("env created\n")
        # init training variables
        max_timesteps = data.shape[0] - 1
        state = env.reset()
        done = True
        episode_length = 0
        count = 0
        while count < max_timesteps - 1:
            episode_length += 1
            if done:
                cx = Variable(torch.zeros(1, params.state_dim))
                hx = Variable(torch.zeros(1, params.state_dim))
            else:
                cx = Variable(cx.data)
                hx = Variable(hx.data)
            values = []
            log_probs = []
            rewards = []
            entropies = []
            # rollout
            while count < max_timesteps - 1:
                value, action_values, (hx, cx) = model((Variable(state.unsqueeze(0)), (hx, cx)))
                prob = F.softmax(action_values, dim=-1)
                log_prob = F.log_softmax(action_values, dim=-1).reshape(-1,)
                entropy = -(log_prob * prob).sum(1, keepdim=True)
                entropies.append(entropy)
                action = sample(prob)
                log_prob = log_prob.gather(0, Variable(action))
                state, reward, done = env.step(action)
                done = (done or count == max_timesteps - 2)
                reward = max(min(reward, 1), -1)
                count += 1
                if done:
                    episode_length = 0
                    state = env.reset()
                values.append(value)
                log_probs.append(log_prob)
                rewards.append(reward)
                print(ticker, "rank ", rank, " action:", action, "reward ", reward)
                if done:
                    break
            # bootstrap value for the last state of the rollout
            R = torch.zeros(1, 1)
            if not done:
                value, _, _ = model((Variable(state.unsqueeze(0)), (hx, cx)))
                R = value.data
            values.append(Variable(R))
            policy_loss = 0
            value_loss = 0
            R = Variable(R)
            gae = torch.zeros(1, 1)
            # generalized advantage estimation, walked backwards over the rollout
            for i in reversed(range(len(rewards))):
                R = params.gamma * R + rewards[i]
                advantage = R - values[i]
                value_loss = value_loss + 0.5 * advantage.pow(2)
                TD = rewards[i] + params.gamma * values[i + 1].data - values[i].data
                gae = gae * params.gamma * params.tau + TD
                policy_loss = policy_loss - log_probs[i] * Variable(gae) - 0.01 * entropies[i]
            optimizer.zero_grad()
            (policy_loss + 0.5 * value_loss).backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 40)
            optimizer.step()
    except Exception:
        traceback.print_exc()
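To check whether the gradients blow up before the NaNs appear, I am also thinking of logging the total gradient norm that clip_grad_norm_ returns, and of switching on anomaly detection while debugging (illustrative, not in the code above):

# clip_grad_norm_ returns the total norm of the gradients before clipping,
# so it can be logged right where it is already called:
total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 40)
print("rank", rank, "grad norm before clipping:", float(total_norm))

# makes backward() raise on the operation that first produced a NaN:
torch.autograd.set_detect_anomaly(True)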
Below is the code used to sample an action:
def sample(logits):
    # Gumbel-max trick: add Gumbel noise (-log(-log(U))) and take the argmax
    noise = torch.rand(logits.shape)
    return torch.argmax(logits - torch.log(-torch.log(noise)), 1)
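As far as I understand, sample uses the Gumbel-max trick, and in the training loop it is called on the softmax output prob. As a debugging alternative I am considering drawing the action directly from the probability vector with torch.multinomial (just a sketch; it is not identical to what sample does, because Gumbel-max over x draws from softmax(x)):

def sample_multinomial(prob):
    # prob: shape (1, num_actions), the output of F.softmax
    # returns a LongTensor of shape (1,), like torch.argmax(..., 1) above
    return torch.multinomial(prob, num_samples=1).reshape(-1,)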