A problem with changing the input type in PyTorch reinforcement learning

Date: 2021-04-01 13:33:47

Tags: pytorch

I'm trying to adapt some code I found on GitHub, but it keeps crashing at one point with TypeError: only size-1 arrays can be converted to Python scalars, and I've been trying to solve it for two days. If I understand the problem correctly, my tensor dtypes are incompatible, but I don't know how to fix that. Every time I try to change the tensor type to DoubleTensor or float64 I run into other errors, and at this point I simply don't know which part needs to change, or how.
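
For reference, this is the kind of conversion I think I need somewhere, shown as a minimal standalone sketch (the 11-element list is just a placeholder for whatever my environment actually returns, not my real data):

import numpy as np
import torch
import torch.nn as nn

# Placeholder observation: my env seems to hand back a list / float64 array,
# while nn.Linear weights are float32 by default.
state = [0.0] * 11
state_t = torch.as_tensor(np.asarray(state, dtype=np.float32))
layer = nn.Linear(11, 128)
out = layer(state_t)            # works: float32 input matches float32 weights
print(state_t.dtype, out.shape)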

Here is the model I defined:

class Policy(nn.Module):
    def __init__(self):      
        super(Policy, self).__init__()
        self.input_layer = nn.Linear(11, 128)
        self.hidden_1 = nn.Linear(128, 128)
        self.hidden_2 = nn.Linear(32,31)
        self.hidden_state = torch.tensor(torch.zeros(2,1,32)).cuda()
        self.rnn = nn.GRU(128, 32, 2)
        self.action_head = nn.Linear(31, 5)
        self.value_head = nn.Linear(31, 1)
        self.saved_actions = []
        self.rewards = []

    def reset_hidden(self):
        self.hidden_state = torch.tensor(torch.zeros(2,1,32)).cuda()
        
    def forward(self, x): 
        x = torch.tensor(x).cuda()
        x = torch.sigmoid(self.input_layer(x))
        x = torch.tanh(self.hidden_1(x))
        x, self.hidden_state = self.rnn(x.view(1,-1,128), self.hidden_state.data)
        x = F.relu(self.hidden_2(x.squeeze()))
        action_scores = self.action_head(x)
        state_values = self.value_head(x)
        return F.softmax(action_scores, dim=-1), state_values

        def forward(self, x):
          conv_out = self.conv(x).view(x.size()[0], -1)
          val = self.fc_val(conv_out)
          adv = self.fc_adv(conv_out)
          return val + (adv - adv.mean(dim=1, keepdim=True))
    
    def act(self, state):
        probs, state_value = self.forward(state)
        m = Categorical(probs)
        action = m.sample()
        if action == 1 and env.state[0] < 1: action = torch.LongTensor([2]).squeeze().cuda.DoubleTensor()
        if action == 4 and env.state[1] < 1: action = torch.LongTensor([2]).squeeze().cuda.DoubleTensor()
        if action == 6 and env.state[2] < 1: action = torch.LongTensor([2]).squeeze().cuda.DoubleTensor()
        self.saved_actions.append((m.log_prob(action), state_value))
        return action.item()
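
This is roughly where I have been trying to change the type, at the top of forward (just a sketch of the edit, assuming x arrives as a list or NumPy array from the environment; I'm not sure this is the right place for the cast):

    def forward(self, x):
        # cast the incoming state to float32 so it matches the float32 layer weights
        x = torch.as_tensor(x, dtype=torch.float32).cuda()
        x = torch.sigmoid(self.input_layer(x))
        # ... rest of forward unchanged from the version above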

And here is where it crashes:

env.reset()
# In case you're running this a second time with the same model, delete the gradients
del model.rewards[:]
del model.saved_actions[:]

gamma = 0.9
log_interval = 60

def finish_episode():
    R = 0
    saved_actions = model.saved_actions
    policy_losses = []
    value_losses = []
    rewards = []
    for r in model.rewards[::-1]:
        R = r + (gamma * R)
        rewards.insert(0, R)
    rewards = torch.tensor(rewards)
    
    epsilon = (torch.rand(1) / 1e4) - 5e-5
    # With different architectures, I found the following standardization step sometimes
    # helpful, sometimes unhelpful.
    #
    rewards = (rewards - rewards.mean()) / (rewards.std(unbiased=False) + epsilon)
    # Alternatively, comment it out and use the following line instead:
    rewards += epsilon
    
    for (log_prob, value), r in zip(saved_actions, rewards):
        reward = torch.tensor(r - value.item()).cuda()
        policy_losses.append(-log_prob * reward)
        value_losses.append(F.smooth_l1_loss(value, torch.tensor([r]).cuda()))
        
    optimizer.zero_grad()
    loss = torch.stack(policy_losses).sum() + torch.stack(value_losses).sum()
    loss = torch.clamp(loss, -1e-5, 1e5)
    loss.backward()
    optimizer.step()
    del model.rewards[:]
    del model.saved_actions[:]

running_reward = 0
for episode in range(0, 4000):
    state = env.reset()
    state = state.type(torch.float32)
    reward = 0
    done = False
    msg = None 
    while not done:
        action = model.act(state)
        state, reward, done, msg = env.step(action)
        model.rewards.append(reward)
        if done:
            break
    running_reward = running_reward * (1 - 1/log_interval) + reward * (1/log_interval)
    finish_episode()
    # Resetting the hidden state seems unnecessary - it's effectively random from the previous
    # episode anyway, more random than a bunch of zeros.
    # model.reset_hidden()
    if msg["msg"] == "done" and env.portfolio_value() > env.starting_portfolio_value * 1.1 and running_reward > 500:
        print("Early Stopping: " + str(int(reward)))
        break
    if episode % log_interval == 0:
        print("""Episode {}: started at {:.1f}, finished at {:.1f} because {} @ t={}, \
last reward {:.1f}, running reward {:.1f}""".format(episode, env.starting_portfolio_value, \
              env.portfolio_value(), msg["msg"], env.cur_timestep, reward, running_reward))

This is the error I get:

/usr/local/lib/python3.7/dist-packages/ipykernel_launcher.py:18: UserWarning: To copy construct from a tensor, it is recommended to use sourceTensor.clone().detach() or sourceTensor.clone().detach().requires_grad_(True), rather than torch.tensor(sourceTensor).
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-74-21b617d2e36f> in <module>()
     46     msg = None
     47     while not done:
---> 48         action = model.act(state)
     49         state, reward, done, msg = env.step(action)
     50         model.rewards.append(reward)

4 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in linear(input, weight, bias)
   1751     if has_torch_function_variadic(input, weight):
   1752         return handle_torch_function(linear, (input, weight), input, weight, bias=bias)
-> 1753     return torch._C._nn.linear(input, weight, bias)
   1754 
   1755 

RuntimeError: expected scalar type Double but found Float

This is the GitHub notebook I'm working from: https://github.com/tomgrek/RL-stocktrading/blob/master/Finance%20final.ipynb

Someone suggested that the problem is that torch throws an error when I pass "state" as an argument because it expects a numeric type, but I can't cast state to any float type either, because then I get a different error saying that a list can't be converted to float32.
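
If I understand that suggestion correctly, the change would look something like this in the training loop (a sketch only, assuming env.reset() and env.step() return a list or a float64 NumPy array, which may not match what my env actually returns):

state = env.reset()
state = torch.as_tensor(np.asarray(state, dtype=np.float32))   # list/ndarray -> float32 tensor
done = False
while not done:
    action = model.act(state)
    state, reward, done, msg = env.step(action)
    state = torch.as_tensor(np.asarray(state, dtype=np.float32))
    model.rewards.append(reward)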

I would really appreciate it if you could show me what I'm doing wrong.

1 answer:

Answer 0 (score: 0):

First of all, are you using the same environment "env" and/or the same dataset?

Second, you added the line state = state.type(torch.float32) and it didn't throw an error, so I assume state is already a tensor (which is a bit strange). If you have to change its type to float32 there, then you probably forgot to change the type again inside the while loop that follows.

while not done:
    action = model.act(state)  
    state, reward, done, msg = env.step(action)  
    state = state.float()  # Add this line; I think env.step(action) returns a long tensor for some reason
    model.rewards.append(reward)  
    if done:  
        break

Good luck.