How can I stabilize my Deep Q-Network in PyTorch?

Time: 2019-08-24 17:28:05

Tags: python-3.x deep-learning pytorch

I am trying to teach my noisy deep Q-network to run production for a few days, harvest the product, and exchange the filter when necessary.

The goal is to maximize the reward and to keep running for as long as possible within the given horizon.

I built a test environment for this. I am using a noisy deep Q-network with a target net and an evaluation net, but the network simply does not stabilize; the reward even decreases.
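
For reference, the target/evaluation-net setup boils down to the standard one-step TD target that optimize_model in the code below computes. A compressed, self-contained sketch of that idea (the tiny linear nets and the random batch are just placeholders, not my real model):

import torch
import torch.nn as nn
import torch.nn.functional as F

policy_net = nn.Linear(4, 3)                          # 4 state variables, 3 actions
target_net = nn.Linear(4, 3)
target_net.load_state_dict(policy_net.state_dict())   # hard sync, as in my training loop

states      = torch.randn(8, 4)
actions     = torch.randint(0, 3, (8, 1))
rewards     = torch.randn(8)
next_states = torch.randn(8, 4)
GAMMA = 0.999

q_sa = policy_net(states).gather(1, actions)          # Q(s, a) from the evaluation net
with torch.no_grad():
    q_next = target_net(next_states).max(dim=1)[0]    # max_a' Q_target(s', a')
td_target = rewards + GAMMA * q_next                  # terminal-state mask omitted here
loss = F.smooth_l1_loss(q_sa, td_target.unsqueeze(1))
loss.backward()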

Any suggestions as to what is going wrong? I also tried PPO, which works reasonably well in some respects but is still far from optimal; the network seems to have trouble with the filter exchange.

My state variables: s = [product, by-product (not important in this example), filter capacity, days since the start of production]

Actions: 0 = produce, 1 = exchange filter, 2 = harvest product
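
Concretely, the spaces look like this (this mirrors what my TestEnv at the end of the post defines; the example_state values are just illustrative):

import numpy as np
from gym import spaces

# state = [product, by-product, filter capacity, days since production start]
observation_space = spaces.Box(-np.inf, np.inf, shape=(4,), dtype=np.float32)
action_space = spaces.Discrete(3)   # 0 = produce, 1 = exchange filter, 2 = harvest

example_state = np.array([0.5, 1.0, 0.8, 3], dtype=np.float32)  # illustrative values only
assert observation_space.contains(example_state)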

The training horizon is 120 steps. The episode is also done if the filter is not exchanged before its capacity reaches 0.2.
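
Put as code, the termination rule I intend is roughly the following (episode_done is just a hypothetical helper for illustration; the real check lives in check_boundaries and the training loop below):

HORIZON = 120       # training horizon in steps
FILTER_MIN = 0.2    # episode ends once the filter capacity falls to this level

def episode_done(step, filter_capacity):
    # done when the filter was not exchanged in time, or the horizon is reached
    return filter_capacity <= FILTER_MIN or step >= HORIZON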

As you can see, the number of steps per episode does not stay near 120 as expected.

[Plot: Rewards and steps per episode]

Any ideas what I could do? Is the state space too large?

Thank you very much for any input and help :)

# -*- coding: utf-8 -*-
"""
noisy network based on https://github.com/cyoon1729/Reinforcement-learning/tree/master/Deep-Q-Networks/noisyDQN
"""

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import torch.optim as optim
import torchvision.transforms as T

import numpy as np
import gym
from itertools import count
import math
import random
from collections import deque, namedtuple
from test_env import TestEnv
import matplotlib.pyplot as plt
import csv

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class FactorizedNoisyLinear(nn.Module):

    def __init__(self, num_in, num_out, is_training=True):
        super(FactorizedNoisyLinear, self).__init__()
        self.num_in = num_in
        self.num_out = num_out 
        self.is_training = is_training

        # Register the learnable parameters directly; wrapping them in .to(device)
        # here can return a plain tensor instead of an nn.Parameter and hide it from
        # the optimizer. The whole module is moved to `device` by the agent instead.
        self.mu_weight = nn.Parameter(torch.FloatTensor(num_out, num_in))
        self.mu_bias = nn.Parameter(torch.FloatTensor(num_out))
        self.sigma_weight = nn.Parameter(torch.FloatTensor(num_out, num_in))
        self.sigma_bias = nn.Parameter(torch.FloatTensor(num_out))
        self.register_buffer("epsilon_i", torch.FloatTensor(num_in))
        self.register_buffer("epsilon_j", torch.FloatTensor(num_out))

        self.reset_parameters()
        self.reset_noise()

    def forward(self, x):
        # Fresh factorized noise is sampled on every forward pass.
        self.reset_noise()

        if self.is_training:
            # The weight noise is the outer product of the per-output and
            # per-input noise vectors (epsilon_j, epsilon_i).
            epsilon_weight = self.epsilon_j.ger(self.epsilon_i)
            epsilon_bias = self.epsilon_j
            weight = self.mu_weight + self.sigma_weight * epsilon_weight
            bias = self.mu_bias + self.sigma_bias * epsilon_bias
        else:
            weight = self.mu_weight
            bias = self.mu_bias

        return F.linear(x, weight, bias)

    def reset_parameters(self):
        std = 1 / math.sqrt(self.num_in)
        self.mu_weight.data.uniform_(-std, std)
        self.mu_bias.data.uniform_(-std, std)

        self.sigma_weight.data.fill_(0.5 / math.sqrt(self.num_in))
        self.sigma_bias.data.fill_(0.5 / math.sqrt(self.num_in))

    def reset_noise(self):
        # Sample fresh noise for inputs (epsilon_i) and outputs (epsilon_j) and apply
        # the factorized-noise transform f(x) = sign(x) * sqrt(|x|) from the NoisyNet
        # paper (Fortunato et al., 2017).
        eps_i = torch.randn(self.num_in).to(device)
        eps_j = torch.randn(self.num_out).to(device)
        self.epsilon_i = eps_i.sign() * (eps_i.abs()).sqrt()
        self.epsilon_j = eps_j.sign() * (eps_j.abs()).sqrt()


Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0

    def push(self, state,action,next_state,reward):
        """Saves a transition."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        action = torch.tensor(action, dtype=torch.long, device=device)
        if next_state is not None:
            next_state = torch.from_numpy(next_state).float().unsqueeze(0).to(device)
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(state,action,next_state,reward)
        self.position = (self.position + 1) % self.capacity

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)




def clip_reward(state,reward):
    # Reshape the raw environment reward (which is on the order of 1e4-1e8) into a
    # small value: positive rewards become the current product amount x, large
    # losses become 0.2 and everything else 0.5.
    x,y,z,e = state
    if reward > 0:
        return x*1
    else:
        if reward < -19000:
            return 0.2
        else:
            return 0.5



def writecsv(line, ep):
    with open('noisy_train/noisy_output_'+str(ep)+'.csv', 'a') as newFile:
            newFileWriter = csv.writer(newFile)
            newFileWriter.writerow(line)    
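
# Note: mini_batch_train below appears to be carried over from the referenced noisyDQN
# repository. It is not called by the training loop at the bottom of this script and it
# expects a different agent interface (get_action / replay_buffer / update) than the
# NoisyDQNAgent defined here provides.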

def mini_batch_train(env, agent, max_episodes, max_steps, batch_size):
    episode_rewards = []

    for episode in range(max_episodes):
        state = env.reset()
        episode_reward = 0
        real_cumreward = 0

        for step in range(max_steps):
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            real_cumreward += reward
            if episode%10 == 0:
                writecsv((action,state,next_state,reward,done,real_cumreward),episode)
            reward = clip_reward(state,reward)
            if step == max_steps-1 or done:
                reward += 5
            agent.replay_buffer.push(state, action, reward, next_state, done)
            episode_reward += reward

            if len(agent.replay_buffer) > batch_size:
                agent.update(batch_size)   

            if done or step == max_steps-1:
                episode_rewards.append(episode_reward)
                print("Episode {} : {} ; steps : {}".format(episode, real_cumreward,step))
                break

            state = next_state
        if episode%20 == 0 and episode>0:
            plt.plot(episode_rewards)
            plt.show()

    return episode_rewards


class NoisyDQN(nn.Module):

    def __init__(self, input_dim, output_dim):
        super(NoisyDQN, self).__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim

        self.noisy_fc = nn.Sequential(
            nn.Linear(self.input_dim[0], 252),
            nn.ReLU(),
            FactorizedNoisyLinear(252, 252),
            nn.ReLU(),
            FactorizedNoisyLinear(252, self.output_dim)
        )

    def forward(self, state):
        qvals = self.noisy_fc(state).to(device)

        return qvals
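
# Quick shape check (hypothetical example): with the 4-dimensional state and 3 actions
# described in the question, NoisyDQN((4,), 3).to(device) maps a (1, 4) state tensor to
# a (1, 3) tensor of Q-values, one per action.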



class NoisyDQNAgent:

    def __init__(self, env, learning_rate=1e-4, gamma=0.99, buffer_maxlen=100000):
        self.env = env
        self.learning_rate = learning_rate
        self.gamma = gamma
        #self.replay_buffer = BasicBuffer(buffer_maxlen)
        self.target_net_update_freq = 100
        self.update_count = 0
        self.memory = ReplayMemory(buffer_maxlen)

        self.policy_net = NoisyDQN(self.env.observation_space.shape, self.env.action_space.n).to(device)
        self.target_net = NoisyDQN(self.env.observation_space.shape, self.env.action_space.n).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()

        self.MSE_loss = nn.MSELoss()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate)


    def select_action(self, state):
        # Epsilon-greedy exploration schedule (applied on top of the noisy layers).
        sample = random.random()
        eps_threshold = EPS_END + (EPS_START - EPS_END) * \
            math.exp(-1. * self.update_count / EPS_DECAY)
        self.update_count += 1
        if sample > eps_threshold:
            with torch.no_grad():
                state = torch.FloatTensor(state).unsqueeze(0).to(device)
                qvals = self.policy_net(state)
                action = np.argmax(qvals.cpu().numpy())
                return torch.tensor([[action]], device=device, dtype=torch.long)
        else:
            return torch.tensor([[random.randrange(self.env.action_space.n)]], device=device, dtype=torch.long)


    def optimize_model(self):
        if len(self.memory) < BATCH_SIZE:
            return
        transitions = self.memory.sample(BATCH_SIZE)
        # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
        # detailed explanation). This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        # Compute a mask of non-final states and concatenate the batch elements
        # (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                              batch.next_state)), device=device, dtype=torch.bool)

        non_final_next_states = torch.cat([s for s in batch.next_state
                                                    if s is not None]).to(device)
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)

        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of actions taken. These are the actions which would've been taken
        # for each batch state according to policy_net
        state_action_values = self.policy_net(state_batch).gather(1, action_batch)

        # Compute V(s_{t+1}) for all next states.
        # Expected values of actions for non_final_next_states are computed based
        # on the "older" target_net; selecting their best reward with max(1)[0].
        # This is merged based on the mask, such that we'll have either the expected
        # state value or 0 in case the state was final.
        next_state_values = torch.zeros(BATCH_SIZE, device=device)
        next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()
        # Compute the expected Q values
        expected_state_action_values = (next_state_values * GAMMA) + reward_batch

        # Compute Huber loss
        loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))
        #loss = self.compute_loss(state_action_values, expected_state_action_values.unsqueeze(1))

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()
        for param in self.policy_net.parameters():
            param.grad.data.clamp_(-1, 1)
        self.optimizer.step()


def plot_res(rewards,durations):
    fig, (ax1, ax2)  = plt.subplots(1,2)
    ax1.plot(rewards)
    ax1.set_title('Rewards')
    ax2.plot(durations)
    ax2.set_title('Steps')
    plt.show()


#Hyper-Parameters
BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.1
EPS_DECAY = 500
TARGET_UPDATE = 10

horizon = 120

env = TestEnv()
agent = NoisyDQNAgent(env)
episode_durations = []
episode_rewards = []

num_episodes = 1000
for i_episode in range(num_episodes):
    # Initialize the environment and state
    state = env.reset()
    total_reward = 0
    for t in range(horizon+1):
        # Select and perform an action
        action = agent.select_action(state)
        next_state, reward, done, _ = env.step(action.item())
        reward = clip_reward(state,reward)
        if done:
            next_state = None
            reward = -1.0
        total_reward += reward
        reward = torch.tensor([reward], device=device)
        # Store the transition in memory
        agent.memory.push(state, action, next_state, reward)

        # Perform one step of the optimization (on the policy network)
        agent.optimize_model()
        if done:
            break
        # Move to the next state
        state = next_state
    episode_durations.append(t)
    episode_rewards.append(total_reward)
    # Periodically update the target network, copying all weights and biases from the policy net
    if i_episode % TARGET_UPDATE == 0:
        agent.target_net.load_state_dict(agent.policy_net.state_dict())
    if i_episode % 20 == 0 and i_episode>0:
        plot_res(episode_rewards,episode_durations)

print('Complete')
# -*- coding: utf-8 -*-
"""
test_env.py
"""
import math
import pickle
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np

import random
import pandas as pd
from scipy.interpolate import interp1d

pdfs = {0: [0.0, 0.0],
 1: [0.24705, 0.04705],
 2: [0.06470000000000001, 0.001966666666666663],
 3: [0.21175000000000002, 0.007850000000000015],
 4: [0.32944999999999997, 0.007850000000000005],
 5: [0.19410000000000005, 0.009799999999999994],
 6: [0.40590000000000004, 0.009799999999999994],
 7: [0.45294999999999985, 0.009816666666666621],
 8: [0.4647000000000001, 0.001966666666666672],
 9: [0.54115, 0.003916666666666717],
 10: [0.33529999999999993, 0.001966666666666598],
 11: [0.017649999999999944, 0.005883333333333314],
 12: [-0.035150000000000015, 0.003950000000000046],
 13: [-0.058350000000000124, 0.019783333333333337],
 14: [0.005300000000000082, 0.00216666666666665]}

pdfs2 = {'x': {0: 0,
  1: 1,
  2: 2,
  3: 3,
  4: 4,
  5: 5,
  6: 6,
  7: 7,
  8: 8,
  9: 9,
  10: 10,
  11: 11,
  12: 12,
  13: 13,
  14: 14},
 'average': {0: 1.01,
  1: 0.94,
  2: 1.068333333,
  3: 1.098333333,
  4: 1.001666667,
  5: 1.19,
  6: 1.361666667,
  7: 1.163333333,
  8: 1.12,
  9: 0.9,
  10: 0.86,
  11: 0.935,
  12: 1.198333333,
  13: 1.658333333,
  14: 1.7783333330000002},
 'Stdv': {0: 0.06333333299999999,
  1: 0.05,
  2: 0.058333333,
  3: 0.085,
  4: 0.058333333,
  5: 0.173333333,
  6: 0.5216666670000001,
  7: 0.55,
  8: 0.69,
  9: 0.61,
  10: 0.4,
  11: 0.12166666699999999,
  12: 0.12166666699999999,
  13: 0.145,
  14: 0.081666667}}


def filter_decay(x):
    x = round(x)
    points1 = np.array(  [[ 0.,          1.        ],
                         [10.,          0.97964406],
                         [20.,          0.92966907],
                         [30.,          0.81829748],
                         [40.,          0.67969409],
                         [50.,          0.61382317]])

    points2 = np.array([[ 0.,          1.        ],
                         [10.,          0.94324923],
                         [20.,          0.90694018],
                         [30.,          0.78421843],
                         [40.,          0.67743549],
                         [50.,          0.58200272]])
    # x and y vectors of the two measured decay curves
    h1 = points1[:,0]
    y1 = points1[:,1]
    h2 = points2[:,0]
    y2 = points2[:,1]
    # interpolate between the measured points (not a polynomial fit)
    f1 = interp1d(h1, y1)
    f2 = interp1d(h2, y2)
    # linear extrapolation beyond x = 50 that reaches zero at x = 85
    a = (f1(50)/(1-50/85)/85)
    b = f1(50)/(1-50/85)
    if x == 0:
        return 1
    if 0 < x <= 50:
        return(random.uniform(f2(x),f1(x)))
    if x < 0:
        return None
    else:
        return b-a*x+([-1,1][random.randrange(2)])*random.random()*(0.02)


class TestEnv(gym.Env):


    def __init__(self):

        self.pdfs = pdfs

        self.pdfs2 = pdfs2

        self.profit = 6000

        self.Volume = 20000 #l
        self.wv = 0.75 

        self.CV = 484 #l
        self.filtercost = 6000 #$/l
        self.dbc = 15 #g/l

        self.CIP = 12833.31
        self.Capture = 20541.19
        self.Polish = 14939.42
        self.Labor = 351949.37

        self.fixcosts = 18410.23


        self.n_actions = 3
        self.n_states = 4


        self.X = 4
        self.Y = 4
        self.Z = 0.2
        #60 Cycles Replacement
        self.D = 60
        self.d = 0
        self.E = 6
        self.T = 365*3
        self.growthBound = 8


        self.action_space = spaces.Discrete(3)
        self.observation_space = spaces.Box(-np.inf,np.inf,shape=(4,),dtype=np.float32)

        self.seed()
        self.viewer = None
        self.state = None

        #self.steps_beyond_done = None

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def rewardfunc(self,state,action):
        """
        Reward for each of the 3 decisions:
            0 = Produce
            1 = Exchange Filter
            2 = Harvest Production
        """
        x,y,z,e = state
        if action == 0:
            reward = -self.fixcosts #€ per day
        elif action == 1:
            reward = -self.filtercost*self.CV-self.fixcosts
        elif action == 2:
            deltacycle = self.cyclenum(x,z)
            reward = (self.profit*x*self.Volume*self.wv)-(self.CIP*deltacycle+self.Capture+self.Polish+self.Labor)-self.fixcosts*8
        return reward
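
    # Worked example of the reward scale (illustrative numbers only): harvesting
    # (action 2) with x = 3 and z = 0.5 gives
    #   deltacycle = ceil(3*20000*0.75 / (0.5*15*484)) = ceil(45000/3630) = 13
    #   reward     = 6000*3*20000*0.75
    #                - (12833.31*13 + 20541.19 + 14939.42 + 351949.37) - 18410.23*8
    #              ≈ 2.69e8
    # which is presumably why the agent script reshapes the raw reward with
    # clip_reward before learning.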


    def step(self, action, reset=False):
        """
        Apply one of the 3 decisions:
            0 = Produce
            1 = Exchange Filter
            2 = Harvest Production
        """
        assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action))

        state = self.state
        x,y,z,e = state
        if reset:
            if e > self.E:
                pdftime = 0
            else:
                pdftime = e
        else:
            if e > self.E:
                pdftime = 0
            else:
                pdftime = e+8
        alphax = np.random.normal(loc = self.pdfs[pdftime][0], scale=self.pdfs[pdftime][1])
        betay = np.random.normal(loc = self.pdfs2["average"][pdftime], scale=self.pdfs2['Stdv'][pdftime])
        #cyclenum = random.uniform(0.01,0.001)
        deltacycle = self.cyclenum(x,z)
        if action == 0:
            e += 1
            x += alphax
            y = betay
            reward = self.rewardfunc(state,action)
        elif action == 1:
            e += 1
            x += alphax
            y = betay
            z = 1
            self.d = 0
            reward = self.rewardfunc(state,action)
        else:
            #x,y,e = 0,betay,0 ##old, long env
            self.d += deltacycle
            z = filter_decay(self.d)
            if z <= 0:
                z = 0.1
            x,y,e = self.harvest_trans(z)
            reward = self.rewardfunc(state,action)
        self.state = np.array([x,y,z,e])
        if reset:
            done = False
        else:
            done = self.check_boundaries()
        return self.state, reward, done, {}

    def check_boundaries(self):
        """
        Episode termination conditions:
            product x or by-product y at or above their upper bounds (X, Y),
            production time e at or above E days,
            or filter capacity z at or below Z (= 0.2)
        """
        x,y,z,e = self.state
        if x >= self.X or y >= self.Y or e >= self.E:
            return True
        elif z <= self.Z:
            return True
        else:
            return False


    def harvest_trans(self,z):
        #z = random.uniform(self.Z,1)

        #if z > 0.97:
            #z = 1
        #d = self.get_starting_d(z)
        y = np.random.normal(loc = self.pdfs2["average"][0], scale=self.pdfs2['Stdv'][1])
        self.state = np.array([0,y,z,0])
        for i in range(9):
            next_state, reward, done, _ = self.step(0, reset=True)
        x,y,z,e = self.state
        e = 0
        return x,y,e

    def reset(self):
        #z = random.uniform(self.Z,1)

        #if z > 0.97:
            #z = 1
        #d = self.get_starting_d(z)
        self.d = 0
        y = np.random.normal(loc = self.pdfs2["average"][0], scale=self.pdfs2['Stdv'][1])
        self.state = np.array([0,y,1,0])
        for i in range(9):
            next_state, reward, done, _ = self.step(0)
        x,y,z,e = self.state
        self.state = np.array([x,y,z,0])


        return self.state

    def cyclenum(self,x,z):
        # Number of capture cycles needed for the harvest: total product mass
        # (x * Volume * wv) divided by what one cycle can bind at the current
        # filter capacity (z * dbc * CV), based on the units noted in __init__.
        return math.ceil((x*self.Volume*self.wv)/(z*self.dbc*self.CV))

0 Answers:

No answers