我正在尝试通过OpenAI向上学习深度强化学习。为此,我想使用pytorch而不是tensorflow重写他们的一些代码。 目前,我正在尝试将代码转换为基本策略梯度(link with explanations),这是到目前为止的代码:
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax
from torch.distributions import Categorical
import torch.optim as optim
import numpy as np
import gym
from gym.spaces import Discrete, Box
class Policy(nn.Module):
def __init__(self, sizes, activation=nn.Tanh(), output_activation=None):
# Build a feedforward neural network.
super(Policy, self).__init__()
self.layers=nn.ModuleList([nn.Linear(sizes[i],sizes[i+1]) for i in
range(len(sizes)-1)])
self.activation=activation
self.output_activation=output_activation
self.returns=[] # for R(tau) weighting in policy gradient
self.rewards=[] # list for rewards accrued throughout ep
self.logits=[] # for measuring episode logits
def forward(self,x):
for layer in self.layers[:-1]:
x=self.activation(layer(x))
x=self.layers[-1](x)
if not self.output_activation==None:
x=self.output_activation(self.layers[-1](x))
return x
# make action selection op (outputs int actions, sampled from policy)
def select_action(logits):
return Categorical(logits=logits).sample()
# make loss function whose gradient, for the right data, is policy gradient
def loss(action_logits,tau_rets):
return torch.sum(torch.dot(log_softmax(action_logits),tau_rets))
def train(env_name='CartPole-v0', hidden_sizes=[32], lr=1e-2,
epochs=50, batch_size=5000, render=False):
# make environment, check spaces, get obs / act dims
env = gym.make(env_name)
assert isinstance(env.observation_space, Box), \
"This example only works for envs with continuous state spaces."
assert isinstance(env.action_space, Discrete), \
"This example only works for envs with discrete action spaces."
obs_dim = env.observation_space.shape[0]
n_acts = env.action_space.n
# make core of policy network
policy = Policy(sizes=[obs_dim]+hidden_sizes+[n_acts])
# make train op
train_op = optim.Adam(policy.parameters(), lr=lr)
# for training policy
def train_one_epoch():
# make some empty lists for logging.
batch_returns = [] # for measuring episode returns
batch_lens = [] # for measuring episode lengths
# reset episode-specific variables
obs = torch.from_numpy(env.reset()).type(torch.FloatTensor) # first obs comes from starting distribution
done = False # signal from environment that episode is over
num_obs=0 # to measure the number of observations
# render first episode of each epoch
finished_rendering_this_epoch = False
# collect experience by acting in the environment with current policy
while True:
# rendering
if (not finished_rendering_this_epoch) and render:
env.render()
# act in the environment
act_logit=policy.forward(obs)
act = select_action(act_logit)
tmp, reward, done, _ = env.step(act.numpy())
obs=torch.from_numpy(tmp).type(torch.FloatTensor)
num_obs+=1
# save logit, reward
policy.rewards.append(reward)
policy.logits.append(act_logit[act].item())
if done:
# if episode is over, record info about episode
ep_ret, ep_len = sum(policy.rewards), len(policy.rewards)
batch_returns.append(ep_ret)
batch_lens.append(ep_len)
# the weight for each logprob(a|s) is R(tau)
policy.returns+= [ep_ret] * ep_len
# reset episode-specific variables
tmp, done, policy.rewards = env.reset(), False, []
obs=torch.from_numpy(tmp).type(torch.FloatTensor)
# won't render again this epoch
finished_rendering_this_epoch = True
# end experience loop if we have enough of it
if num_obs > batch_size:
break
# take a single policy gradient update step
print (len(policy.returns),len(policy.rewards),len(policy.logits))
batch_loss = loss(torch.tensor(policy.logits),torch.tensor(policy.returns))
batch_loss.backward()
return batch_loss, batch_returns, batch_lens
# training loop
for i in range(epochs):
batch_loss, batch_rets, batch_lens = train_one_epoch()
print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
(i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))
当我运行train()时,出现以下错误:
RuntimeError Traceback (most recent call last)
<ipython-input-163-2da0ffaf5447> in <module>()
----> 1 train()
<ipython-input-162-560e772be08b> in train(env_name, hidden_sizes, lr, epochs,
batch_size, render)
114 # training loop
115 for i in range(epochs):
--> 116 batch_loss, batch_rets, batch_lens = train_one_epoch()
117 print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
118 (i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))
<ipython-input-162-560e772be08b> in train_one_epoch()
109 print (len(policy.returns),len(policy.rewards),len(policy.logits))
110 batch_loss = loss(torch.tensor(policy.logits),torch.tensor(policy.returns))
--> 111 batch_loss.backward()
112 return batch_loss, batch_returns, batch_lens
113
~\Anaconda3\lib\site-packages\torch\tensor.py in backward(self, gradient,
retain_graph, create_graph)
91 products. Defaults to ``False``.
92 """
---> 93 torch.autograd.backward(self, gradient, retain_graph, create_graph)
94
95 def register_hook(self, hook):
~\Anaconda3\lib\site-packages\torch\autograd\__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
88 Variable._execution_engine.run_backward(
89 tensors, grad_tensors, retain_graph, create_graph,
---> 90 allow_unreachable=True) # allow_unreachable flag
91
92
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
我不明白为什么会这样,因为我的代码类似于其他this等rl pytorch代码。