I am trying to teach my noisy deep Q-network to produce product for a few days, harvest it, and exchange the filter when necessary.
The goal is to maximize the reward and to keep running as long as possible within the given horizon.
I built a test environment for this and I am using a noisy deep Q-network with a target net and an evaluation net, but the network does not stabilize at all; the reward even decreases.
Any suggestions on what is going wrong? I also tried PPO, which works quite well in some respects but still does not reach the optimum; the network seems to have trouble with the filter exchange.
My state variables: s = [product, by-product (not important in this example), filter capacity, days since the start of production]
Actions: 0 = produce, 1 = exchange filter, 2 = harvest product
The training horizon is 120 steps. The episode is done if the filter is not exchanged before its capacity drops to 0.2.
As you can see, the episode lengths do not stay around 120 as expected.
Any ideas what I could do? Is the state space too big?
Thanks a lot for your input and help :)
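For reference, episode lengths under a purely random policy can be checked with a minimal sketch like this (using the TestEnv from the code below):

from test_env import TestEnv

env = TestEnv()
lengths = []
for _ in range(50):
    state = env.reset()
    for t in range(120):
        # random actions, no learning involved
        state, reward, done, _ = env.step(int(env.action_space.sample()))
        if done:
            break
    lengths.append(t + 1)
print("mean episode length under a random policy:", sum(lengths) / len(lengths))

My full training script and the environment follow.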
# -*- coding: utf-8 -*-
"""
noisy network based on https://github.com/cyoon1729/Reinforcement-learning/tree/master/Deep-Q-Networks/noisyDQN
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd as autograd
import torch.optim as optim
import torchvision.transforms as T
import numpy as np
import gym
from itertools import count
import math
import random
from collections import deque, namedtuple
from test_env import TestEnv
import matplotlib.pyplot as plt
import csv
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class FactorizedNoisyLinear(nn.Module):
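    """
    Linear layer with factorized Gaussian noise (NoisyNet-style):
    weight = mu_w + sigma_w * (f(eps_out) outer f(eps_in)) and
    bias   = mu_b + sigma_b * f(eps_out), with f(x) = sign(x) * sqrt(|x|),
    so exploration comes from learned parameter noise.
    """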
def __init__(self, num_in, num_out, is_training=True):
super(FactorizedNoisyLinear, self).__init__()
self.num_in = num_in
self.num_out = num_out
self.is_training = is_training
        # keep these as registered nn.Parameters; the whole module is moved to
        # `device` later, so calling .to(device) on each Parameter here would
        # silently replace it with a plain tensor and hide it from the optimizer
        self.mu_weight = nn.Parameter(torch.FloatTensor(num_out, num_in))
        self.mu_bias = nn.Parameter(torch.FloatTensor(num_out))
        self.sigma_weight = nn.Parameter(torch.FloatTensor(num_out, num_in))
        self.sigma_bias = nn.Parameter(torch.FloatTensor(num_out))
self.register_buffer("epsilon_i", torch.FloatTensor(num_in))
self.register_buffer("epsilon_j", torch.FloatTensor(num_out))
self.reset_parameters()
self.reset_noise()
def forward(self, x):
self.reset_noise()
if self.is_training:
epsilon_weight = self.epsilon_j.ger(self.epsilon_i)
epsilon_bias = self.epsilon_j
            weight = self.mu_weight + self.sigma_weight * epsilon_weight
            bias = self.mu_bias + self.sigma_bias * epsilon_bias
else:
weight = self.mu_weight
bias = self.mu_bias
y = F.linear(x, weight, bias)
return y
def reset_parameters(self):
std = 1 / math.sqrt(self.num_in)
self.mu_weight.data.uniform_(-std, std)
self.mu_bias.data.uniform_(-std, std)
self.sigma_weight.data.fill_(0.5 / math.sqrt(self.num_in))
self.sigma_bias.data.fill_(0.5 / math.sqrt(self.num_in))
def reset_noise(self):
eps_i = torch.randn(self.num_in).to(device)
eps_j = torch.randn(self.num_out).to(device)
self.epsilon_i = eps_i.sign() * (eps_i.abs()).sqrt()
self.epsilon_j = eps_j.sign() * (eps_j.abs()).sqrt()
Transition = namedtuple('Transition',
('state', 'action', 'next_state', 'reward'))
class ReplayMemory(object):
def __init__(self, capacity):
self.capacity = capacity
self.memory = []
self.position = 0
def push(self, state,action,next_state,reward):
"""Saves a transition."""
state = torch.from_numpy(state).float().unsqueeze(0).to(device)
action = torch.tensor(action, dtype=torch.long, device=device)
if next_state is not None:
next_state = torch.from_numpy(next_state).float().unsqueeze(0).to(device)
if len(self.memory) < self.capacity:
self.memory.append(None)
self.memory[self.position] = Transition(state,action,next_state,reward)
self.position = (self.position + 1) % self.capacity
def sample(self, batch_size):
return random.sample(self.memory, batch_size)
def __len__(self):
return len(self.memory)
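# Reward shaping used during training: a positive raw reward (a harvest) is replaced
# by the harvested product amount x, a raw reward below -19000 (e.g. a filter
# exchange) becomes 0.2, and every other step cost becomes 0.5.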
def clip_reward(state,reward):
x,y,z,e = state
if reward > 0:
return x*1
else:
if reward < -19000:
return 0.2
else:
return 0.5
def writecsv(line, ep):
with open('noisy_train/noisy_output_'+str(ep)+'.csv', 'a') as newFile:
newFileWriter = csv.writer(newFile)
newFileWriter.writerow(line)
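# Alternative training loop kept for reference; the script actually trains via the
# loop at the bottom of this file and never calls mini_batch_train.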
def mini_batch_train(env, agent, max_episodes, max_steps, batch_size):
episode_rewards = []
for episode in range(max_episodes):
state = env.reset()
episode_reward = 0
real_cumreward = 0
for step in range(max_steps):
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action.item())
            real_cumreward += reward
            if episode % 10 == 0:
                writecsv((action.item(), state, next_state, reward, done, real_cumreward), episode)
            reward = clip_reward(state, reward)
            if step == max_steps - 1 or done:
                reward += 5
            # ReplayMemory.push expects (state, action, next_state, reward);
            # terminal transitions store next_state as None
            agent.memory.push(state, action, None if done else next_state,
                              torch.tensor([reward], device=device))
            episode_reward += reward
            if len(agent.memory) > batch_size:
                agent.optimize_model()
if done or step == max_steps-1:
episode_rewards.append(episode_reward)
print("Episode {} : {} ; steps : {}".format(episode, real_cumreward,step))
break
state = next_state
if episode%20 == 0 and episode>0:
plt.plot(episode_rewards)
plt.show()
return episode_rewards
class NoisyDQN(nn.Module):
def __init__(self, input_dim, output_dim):
super(NoisyDQN, self).__init__()
self.input_dim = input_dim
self.output_dim = output_dim
self.noisy_fc = nn.Sequential(
nn.Linear(self.input_dim[0], 252),
nn.ReLU(),
FactorizedNoisyLinear(252, 252),
nn.ReLU(),
FactorizedNoisyLinear(252, self.output_dim)
)
def forward(self, state):
qvals = self.noisy_fc(state).to(device)
return qvals
class NoisyDQNAgent:
def __init__(self, env, learning_rate=1e-4, gamma=0.99, buffer_maxlen=100000):
self.env = env
self.learning_rate = learning_rate
self.gamma = gamma
#self.replay_buffer = BasicBuffer(buffer_maxlen)
self.target_net_update_freq = 100
self.update_count = 0
self.memory = ReplayMemory(buffer_maxlen)
self.policy_net = NoisyDQN(self.env.observation_space.shape, self.env.action_space.n).to(device)
self.target_net = NoisyDQN(self.env.observation_space.shape, self.env.action_space.n).to(device)
self.target_net.load_state_dict(self.policy_net.state_dict())
self.target_net.eval()
self.MSE_loss = nn.MSELoss()
self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate)
def select_action(self, state):
sample = random.random()
eps_threshold = EPS_END + (EPS_START - EPS_END) * \
math.exp(-1. * self.update_count / EPS_DECAY)
self.update_count += 1
if sample > eps_threshold:
            with torch.no_grad():
                state_t = torch.FloatTensor(state).unsqueeze(0).to(device)
                qvals = self.policy_net(state_t)
                action = int(qvals.argmax(dim=1).item())
            return torch.tensor([[action]], device=device, dtype=torch.long)
else:
return torch.tensor([[random.randrange(self.env.action_space.n)]], device=device, dtype=torch.long)
def optimize_model(self):
if len(self.memory) < BATCH_SIZE:
return
transitions = self.memory.sample(BATCH_SIZE)
# Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
# detailed explanation). This converts batch-array of Transitions
# to Transition of batch-arrays.
batch = Transition(*zip(*transitions))
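        # e.g. [T(s1,a1,s1',r1), T(s2,a2,s2',r2)] -> T(state=(s1,s2), action=(a1,a2), ...)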
# Compute a mask of non-final states and concatenate the batch elements
# (a final state would've been the one after which simulation ended)
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
                                                batch.next_state)), device=device, dtype=torch.bool)
non_final_next_states = torch.cat([s for s in batch.next_state
if s is not None]).to(device)
state_batch = torch.cat(batch.state)
action_batch = torch.cat(batch.action)
reward_batch = torch.cat(batch.reward)
# Compute Q(s_t, a) - the model computes Q(s_t), then we select the
# columns of actions taken. These are the actions which would've been taken
# for each batch state according to policy_net
state_action_values = self.policy_net(state_batch).gather(1, action_batch)
# Compute V(s_{t+1}) for all next states.
# Expected values of actions for non_final_next_states are computed based
# on the "older" target_net; selecting their best reward with max(1)[0].
# This is merged based on the mask, such that we'll have either the expected
# state value or 0 in case the state was final.
next_state_values = torch.zeros(BATCH_SIZE, device=device)
next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1)[0].detach()
# Compute the expected Q values
expected_state_action_values = (next_state_values * GAMMA) + reward_batch
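        # i.e. the TD target y = r + GAMMA * max_a' Q_target(s', a'),
        # with the max term set to 0 for terminal next states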
# Compute Huber loss
loss = F.smooth_l1_loss(state_action_values, expected_state_action_values.unsqueeze(1))
#loss = self.compute_loss(state_action_values, expected_state_action_values.unsqueeze(1))
# Optimize the model
self.optimizer.zero_grad()
loss.backward()
for param in self.policy_net.parameters():
param.grad.data.clamp_(-1, 1)
self.optimizer.step()
def plot_res(rewards,durations):
fig, (ax1, ax2) = plt.subplots(1,2)
ax1.plot(rewards)
ax1.set_title('Rewards')
ax2.plot(durations)
ax2.set_title('Steps')
plt.show()
#Hyper-Parameters
BATCH_SIZE = 128
GAMMA = 0.999
EPS_START = 0.9
EPS_END = 0.1
EPS_DECAY = 500
TARGET_UPDATE = 10
horizon = 120
env = TestEnv()
agent = NoisyDQNAgent(env)
episode_durations = []
episode_rewards = []
num_episodes = 1000
for i_episode in range(num_episodes):
# Initialize the environment and state
state = env.reset()
total_reward = 0
for t in range(horizon+1):
# Select and perform an action
action = agent.select_action(state)
next_state, reward, done, _ = env.step(action.item())
reward = clip_reward(state,reward)
if done:
next_state = None
reward = -1.0
total_reward += reward
reward = torch.tensor([reward], device=device)
# Store the transition in memory
agent.memory.push(state, action, next_state, reward)
# Move to the next state
# Perform one step of the optimization (on the target network)
agent.optimize_model()
if done:
break
state = next_state
# Update the target network, copying all weights and biases in DQN
episode_durations.append(t)
episode_rewards.append(total_reward)
if i_episode % TARGET_UPDATE == 0:
agent.target_net.load_state_dict(agent.policy_net.state_dict())
if i_episode % 20 == 0 and i_episode>0:
plot_res(episode_rewards,episode_durations)
print('Complete')
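# Optional: a minimal greedy-evaluation sketch (one episode acting greedily w.r.t.
# the learned Q-values) to see how long the policy survives; not required for training.
state = env.reset()
eval_reward, eval_steps = 0, 0
for t in range(horizon):
    with torch.no_grad():
        q = agent.policy_net(torch.FloatTensor(state).unsqueeze(0).to(device))
    state, reward, done, _ = env.step(int(q.argmax(dim=1).item()))
    eval_reward += reward
    eval_steps = t + 1
    if done:
        break
print("greedy evaluation: reward", eval_reward, "steps", eval_steps)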
# -*- coding: utf-8 -*-
"""
test_env.py
"""
import math
import pickle
import gym
from gym import spaces
from gym.utils import seeding
import numpy as np
import random
import pandas as pd
from scipy.interpolate import interp1d
pdfs = {0: [0.0, 0.0],
1: [0.24705, 0.04705],
2: [0.06470000000000001, 0.001966666666666663],
3: [0.21175000000000002, 0.007850000000000015],
4: [0.32944999999999997, 0.007850000000000005],
5: [0.19410000000000005, 0.009799999999999994],
6: [0.40590000000000004, 0.009799999999999994],
7: [0.45294999999999985, 0.009816666666666621],
8: [0.4647000000000001, 0.001966666666666672],
9: [0.54115, 0.003916666666666717],
10: [0.33529999999999993, 0.001966666666666598],
11: [0.017649999999999944, 0.005883333333333314],
12: [-0.035150000000000015, 0.003950000000000046],
13: [-0.058350000000000124, 0.019783333333333337],
14: [0.005300000000000082, 0.00216666666666665]}
pdfs2 = {'x': {0: 0,
1: 1,
2: 2,
3: 3,
4: 4,
5: 5,
6: 6,
7: 7,
8: 8,
9: 9,
10: 10,
11: 11,
12: 12,
13: 13,
14: 14},
'average': {0: 1.01,
1: 0.94,
2: 1.068333333,
3: 1.098333333,
4: 1.001666667,
5: 1.19,
6: 1.361666667,
7: 1.163333333,
8: 1.12,
9: 0.9,
10: 0.86,
11: 0.935,
12: 1.198333333,
13: 1.658333333,
14: 1.7783333330000002},
'Stdv': {0: 0.06333333299999999,
1: 0.05,
2: 0.058333333,
3: 0.085,
4: 0.058333333,
5: 0.173333333,
6: 0.5216666670000001,
7: 0.55,
8: 0.69,
9: 0.61,
10: 0.4,
11: 0.12166666699999999,
12: 0.12166666699999999,
13: 0.145,
14: 0.081666667}}
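# Filter capacity model: capacity is 1.0 at d = 0 accumulated cycles; for 0 < d <= 50
# it is drawn uniformly between two interpolated reference decay curves; beyond 50 it
# is extrapolated linearly towards 0 at d = 85, plus +/-0.02 uniform noise.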
def filter_decay(x):
x = round(x)
points1 = np.array( [[ 0., 1. ],
[10., 0.97964406],
[20., 0.92966907],
[30., 0.81829748],
[40., 0.67969409],
[50., 0.61382317]])
points2 = np.array([[ 0., 1. ],
[10., 0.94324923],
[20., 0.90694018],
[30., 0.78421843],
[40., 0.67743549],
[50., 0.58200272]])
# get x and y vectors
h1 = points1[:,0]
y1 = points1[:,1]
h2 = points2[:,0]
y2 = points2[:,1]
# calculate polynomial
f1 = interp1d(h1, y1)
f2 = interp1d(h2, y2)
a = (f1(50)/(1-50/85)/85)
b = f1(50)/(1-50/85)
if x == 0:
return 1
if 0 < x <= 50:
return(random.uniform(f2(x),f1(x)))
if x < 0:
return None
else:
return b-a*x+([-1,1][random.randrange(2)])*random.random()*(0.02)
class TestEnv(gym.Env):
def __init__(self):
self.pdfs = pdfs
self.pdfs2 = pdfs2
self.profit = 6000
self.Volume = 20000 #l
self.wv = 0.75
self.CV = 484 #l
self.filtercost = 6000 #$/l
self.dbc = 15 #g/l
self.CIP = 12833.31
self.Capture = 20541.19
self.Polish = 14939.42
self.Labor = 351949.37
self.fixcosts = 18410.23
self.n_actions = 3
self.n_states = 4
self.X = 4
self.Y = 4
self.Z = 0.2
#60 Cycles Replacement
self.D = 60
self.d = 0
self.E = 6
self.T = 365*3
self.growthBound = 8
self.action_space = spaces.Discrete(3)
self.observation_space = spaces.Box(-np.inf,np.inf,shape=(4,),dtype=np.float32)
self.seed()
self.viewer = None
self.state = None
#self.steps_beyond_done = None
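        # State layout: [x = product, y = by-product, z = filter capacity,
        #                e = days since production start]; the episode terminates when
        # x >= X (4), y >= Y (4), e >= E (6) or z <= Z (0.2), see check_boundaries().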
def seed(self, seed=None):
self.np_random, seed = seeding.np_random(seed)
return [seed]
def rewardfunc(self,state,action):
"""
Boundary Conditions
Gives us 3 decisions:
0 = Produce
1 = Exchange Filter
2 = Harvest Production
"""
x,y,z,e = state
if action == 0:
reward = -self.fixcosts #€ per day
elif action == 1:
reward = -self.filtercost*self.CV-self.fixcosts
elif action == 2:
deltacycle = self.cyclenum(x,z)
reward = (self.profit*x*self.Volume*self.wv)-(self.CIP*deltacycle+self.Capture+self.Polish+self.Labor)-self.fixcosts*8
return reward
def step(self, action, reset=False):
"""
Boundary Conditions
Gives us 3 decisions:
0 = Produce
1 = Exchange Filter
2 = Harvest Production
"""
assert self.action_space.contains(action), "%r (%s) invalid"%(action, type(action))
state = self.state
x,y,z,e = state
if reset:
if e > self.E:
pdftime = 0
else:
pdftime = e
else:
if e > self.E:
pdftime = 0
else:
pdftime = e+8
alphax = np.random.normal(loc = self.pdfs[pdftime][0], scale=self.pdfs[pdftime][1])
betay = np.random.normal(loc = self.pdfs2["average"][pdftime], scale=self.pdfs2['Stdv'][pdftime])
#cyclenum = random.uniform(0.01,0.001)
deltacycle = self.cyclenum(x,z)
if action == 0:
e += 1
x += alphax
y = betay
reward = self.rewardfunc(state,action)
elif action == 1:
e += 1
x += alphax
y = betay
z = 1
self.d = 0
reward = self.rewardfunc(state,action)
else:
#x,y,e = 0,betay,0 ##old, long env
self.d += deltacycle
z = filter_decay(self.d)
if z <= 0:
z = 0.1
x,y,e = self.harvest_trans(z)
reward = self.rewardfunc(state,action)
self.state = np.array([x,y,z,e])
if reset:
done = False
else:
done = self.check_boundaries()
return self.state, reward, done, {}
def check_boundaries(self):
"""
Boundary Conditions
Gives us 3 decisions:
0 = Produce
1 = Exchange Filter
2 = Harvest Production
"""
x,y,z,e = self.state
if x >= self.X or y >= self.Y or e >= self.E:
return True
elif z <= self.Z:
return True
else:
return False
def harvest_trans(self,z):
#z = random.uniform(self.Z,1)
#if z > 0.97:
#z = 1
#d = self.get_starting_d(z)
y = np.random.normal(loc = self.pdfs2["average"][0], scale=self.pdfs2['Stdv'][1])
self.state = np.array([0,y,z,0])
for i in range(9):
next_state, reward, done, _ = self.step(0, reset=True)
x,y,z,e = self.state
e = 0
return x,y,e
def reset(self):
#z = random.uniform(self.Z,1)
#if z > 0.97:
#z = 1
#d = self.get_starting_d(z)
self.d = 0
y = np.random.normal(loc = self.pdfs2["average"][0], scale=self.pdfs2['Stdv'][1])
self.state = np.array([0,y,1,0])
for i in range(9):
next_state, reward, done, _ = self.step(0)
x,y,z,e = self.state
self.state = np.array([x,y,z,0])
return self.state
def cyclenum(self,x,z):
return math.ceil((x*self.Volume*self.wv)/((z*self.dbc*self.CV)))