I'm trying to write the code for reinforcement learning in PyTorch from a book I bought. The code should work as printed in the book, but for me the model does not converge and the total reward stays negative. It also produces the following user warning:
/home/user/.local/lib/python3.6/site-packages/ipykernel_launcher.py:30: UserWarning: Using a target size (torch.Size([])) that is different to the input size (torch.Size([1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
I'm a complete beginner with PyTorch, but I assume that size([]) is not a valid tensor size? I think something is going wrong in the code, but after trying for a while I haven't found anything. A while ago I also messaged the book's publisher, but unfortunately I haven't heard back from them.
That's why I wanted to ask here whether anyone has seen this error before and maybe knows how to fix it.
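For reference, the two sizes mentioned in the warning can be reproduced like this (a minimal snippet of my own, not from the book):

import torch

print(torch.tensor(1.0).shape)    # torch.Size([])  -- a zero-dimensional (scalar) tensor
print(torch.tensor([1.0]).shape)  # torch.Size([1]) -- a one-dimensional tensor with one element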
The code implements A2C reinforcement learning in the MountainCar Gym environment. It can also be found here: https://github.com/PacktPublishing/PyTorch-1.x-Reinforcement-Learning-Cookbook/blob/master/Chapter08/chapter8/actor_critic_mountaincar.py
'''
Source codes for PyTorch 1.0 Reinforcement Learning (Packt Publishing)
Chapter 8: Implementing Policy Gradients and Policy Optimization
Author: Yuxi (Hayden) Liu
'''
import torch
import gym
import torch.nn as nn
import torch.nn.functional as F
env = gym.make('MountainCarContinuous-v0')
class ActorCriticModel(nn.Module):
    def __init__(self, n_input, n_output, n_hidden):
        super(ActorCriticModel, self).__init__()
        self.fc = nn.Linear(n_input, n_hidden)
        self.mu = nn.Linear(n_hidden, n_output)
        self.sigma = nn.Linear(n_hidden, n_output)
        self.value = nn.Linear(n_hidden, 1)
        self.distribution = torch.distributions.Normal

    def forward(self, x):
        x = F.relu(self.fc(x))
        mu = 2 * torch.tanh(self.mu(x))
        sigma = F.softplus(self.sigma(x)) + 1e-5
        dist = self.distribution(mu.view(1, ).data, sigma.view(1, ).data)
        value = self.value(x)
        return dist, value


class PolicyNetwork():
    def __init__(self, n_state, n_action, n_hidden, lr=0.001):
        self.model = ActorCriticModel(n_state, n_action, n_hidden)
        self.optimizer = torch.optim.Adam(self.model.parameters(), lr)

    def update(self, returns, log_probs, state_values):
        """
        Update the weights of the Actor Critic network given the training samples
        @param returns: return (cumulative rewards) for each step in an episode
        @param log_probs: log probability for each step
        @param state_values: state-value for each step
        """
        loss = 0
        for log_prob, value, Gt in zip(log_probs, state_values, returns):
            advantage = Gt - value.item()
            policy_loss = -log_prob * advantage
            value_loss = F.smooth_l1_loss(value, Gt)
            loss += policy_loss + value_loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def predict(self, s):
        """
        Compute the output using the continuous Actor Critic model
        @param s: input state
        @return: Gaussian distribution, state_value
        """
        self.model.training = False
        return self.model(torch.Tensor(s))

    def get_action(self, s):
        """
        Estimate the policy and sample an action, compute its log probability
        @param s: input state
        @return: the selected action, log probability, predicted state-value
        """
        dist, state_value = self.predict(s)
        action = dist.sample().numpy()
        log_prob = dist.log_prob(action[0])
        return action, log_prob, state_value


def actor_critic(env, estimator, n_episode, gamma=1.0):
    """
    continuous Actor Critic algorithm
    @param env: Gym environment
    @param estimator: policy network
    @param n_episode: number of episodes
    @param gamma: the discount factor
    """
    for episode in range(n_episode):
        log_probs = []
        rewards = []
        state_values = []
        state = env.reset()
        while True:
            state = scale_state(state)
            action, log_prob, state_value = estimator.get_action(state)
            action = action.clip(env.action_space.low[0],
                                 env.action_space.high[0])
            next_state, reward, is_done, _ = env.step(action)
            total_reward_episode[episode] += reward
            log_probs.append(log_prob)
            state_values.append(state_value)
            rewards.append(reward)
            if is_done:
                returns = []
                Gt = 0
                pw = 0
                for reward in rewards[::-1]:
                    Gt += gamma ** pw * reward
                    pw += 1
                    returns.append(Gt)
                returns = returns[::-1]
                returns = torch.tensor(returns)
                returns = (returns - returns.mean()) / (returns.std() + 1e-9)
                estimator.update(returns, log_probs, state_values)
                print('Episode: {}, total reward: {}'.format(episode, total_reward_episode[episode]))
                break
            state = next_state


import sklearn.preprocessing
import numpy as np

state_space_samples = np.array(
    [env.observation_space.sample() for x in range(10000)])
scaler = sklearn.preprocessing.StandardScaler()
scaler.fit(state_space_samples)


def scale_state(state):
    scaled = scaler.transform([state])
    return scaled[0]


n_state = env.observation_space.shape[0]
n_action = 1
n_hidden = 128
lr = 0.0003
policy_net = PolicyNetwork(n_state, n_action, n_hidden, lr)
n_episode = 200
gamma = 0.9
total_reward_episode = [0] * n_episode

actor_critic(env, policy_net, n_episode, gamma)
Answer 0 (score: 2)
size([]) is valid, but it denotes a single scalar value rather than an array, whereas size([1]) is a one-dimensional array containing exactly one item. It is like comparing 5 with [5]. One possible solution is
returns = returns[::-1]
returns_amount = len(returns)
returns = torch.tensor(returns)
returns = (returns - returns.mean()) / (returns.std() + 1e-9)
returns.resize_(returns_amount, 1)
This turns returns into a two-dimensional tensor, so each Gt you take from it is a one-dimensional tensor with a single element rather than a bare scalar.
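As a quick check of what this does (my own sketch, not part of the answer), iterating over the reshaped tensor yields one-element rows whose shape matches the critic output, so the smooth L1 loss no longer has to broadcast:

import torch
import torch.nn.functional as F

returns = torch.tensor([1.0, 2.0, 3.0])
returns.resize_(3, 1)              # now shape [3, 1]

value = torch.ones(1)              # stand-in for the critic output, shape [1]
for Gt in returns:                 # each Gt now has shape [1] instead of []
    F.smooth_l1_loss(value, Gt)    # shapes match, so no broadcasting warning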
Answer 1 (score: -1)
You already have the answer: "Using a target size (torch.Size([])) that is different to the input size (torch.Size([1]))". Change the shape with view(1) so that input and target match.
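If I read that suggestion correctly (my own interpretation with hypothetical stand-in values, not code from the answer or the book), it amounts to reshaping the target in update() so it has the same [1] shape as the critic output, for example:

import torch
import torch.nn.functional as F

# Hypothetical stand-ins for one step of update(); the shapes mirror the question's code.
value = torch.randn(1)        # critic output for one step, shape [1]
Gt = torch.tensor(0.5)        # one element of the normalized returns tensor, shape []

F.smooth_l1_loss(value, Gt)          # as written in the question: target [] vs input [1], warns
F.smooth_l1_loss(value, Gt.view(1))  # both sides now have shape [1], no warning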