I created a game and tried to make it as easy as possible for my computer to understand.
Here are the rules:
The network is supposed to pick a person for each player, then a role for each player, then a lane for each player. It can do this in any order it likes.
If the network tries to do any of the following, it fails:
I created an array with a [0] for each of the options (5 players * 10 picks + 5 players * 2 role choices + 5 players * 5 lane choices), i.e. 50 + 10 + 25 = 85 [0]s.
Once an action is taken, the env_state for that choice is updated to 1 and fed into the next prediction.
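For illustration, here is a minimal sketch of how I picture that state vector (the flat option index and the mark_choice helper are just names for this example, not my real code):

import numpy as np

NUM_OPTIONS = 5 * 10 + 5 * 2 + 5 * 5   # 50 pick slots + 10 role slots + 25 lane slots = 85

env_state = np.zeros(NUM_OPTIONS)      # all [0]s at the start of a game

def mark_choice(env_state, option_index):
    # once an option is chosen, its slot flips to 1; the updated vector
    # is what gets fed into the next prediction
    env_state[option_index] = 1
    return env_state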
The following happens in each "game":
If the correct team format is chosen, a reward between 0 and 1 is given. If it fails to pick the correct team format, it gets -1.
The goal is to get the highest possible reward.
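Roughly, the reward rule looks like this (only a sketch; is_correct_format and quality stand in for my real scoring, which is more involved):

def reward(is_correct_format, quality):
    # quality is assumed to already be scaled into [0, 1]
    if is_correct_format:
        return quality   # reward between 0 and 1 for a correct team format
    return -1.0          # failed to pick the correct team format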
I let the network train for roughly 24 hours, over 100,000 epochs, and it failed to learn. I have restarted the whole project from scratch 3 times and usually get similarly poor results.
I have adapted the code from https://github.com/keras-rl/keras-rl, but since I don't need the network to know about the past, I removed the Continuous_frames part.
I have spent about 3 weeks on this and, after going through hundreds of Stack Overflow and Medium tutorials, I have genuinely run out of examples and information.
I do think softmax would be the best choice, since my goal down the road is to give every action a percentage chance of being picked.
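What I have in mind is something like a softmax over the Q-values (a sketch of the idea, not something I currently run):

import numpy as np

def action_probabilities(q_values, temperature=1.0):
    # turn Q-values into a percentage chance for every action
    z = (q_values - np.max(q_values)) / temperature   # subtract the max for numerical stability
    p = np.exp(z)
    return p / p.sum()

# e.g. sample an action instead of taking the argmax:
# action = np.random.choice(len(q_values), p=action_probabilities(q_values))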
The overall problem is that the network never seems to learn even the first or second move. I have noticed that it will pick an action and then predict that same action again on the next step. During training that repeat is never possible and always gets a -1 reward.
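To make that concrete: the greedy branch of policy_action below takes np.argmax over all 85 outputs, so nothing stops the network from re-picking a slot that is already taken. A masked version would look roughly like this (a sketch that reuses env.valid_actions(); it is not what my code currently does):

import numpy as np

def greedy_valid_action(q_values, valid_actions):
    # restrict the argmax to the actions the environment still allows
    masked = np.full(len(q_values), -np.inf)
    masked[valid_actions] = np.asarray(q_values)[valid_actions]
    return int(np.argmax(masked))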
import sys
import random as r
import numpy as np
from tqdm import tqdm
from .agent import Agent
from random import random
from utils.memory_buffer import MemoryBuffer
from utils.networks import tfSummary
from utils.stats import gather_stats
class DDQN:
""" Deep Q-Learning Main Algorithm
"""
def __init__(self, action_dim, state_dim, args):
""" Initialization
"""
# Environment and DDQN parameters
np.set_printoptions(threshold=sys.maxsize)
self.with_per = args.with_per
self.action_dim = action_dim
self.state_dim = state_dim
#
self.lr = 2.5e-4
self.gamma = 0.95
self.epsilon = 0.95
self.epsilon_decay = 0.001
self.buffer_size = 20000
#
#if(len(state_dim) < 3):
# self.tau = 1e-2
#else:
self.tau = 1.0
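        # with tau = 1.0, transfer_weights() performs a full (hard) copy of the
        # online network's weights into the target network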
# Create actor and critic networks
self.agent = Agent(self.state_dim, action_dim, self.lr, self.tau, args.dueling)
# Memory Buffer for Experience Replay
self.buffer = MemoryBuffer(self.buffer_size, args.with_per)
def policy_action(self, s, env, epsilon=None):
""" Apply an espilon-greedy policy to pick next action
"""
if epsilon is None:
epsilon = self.epsilon
if random() <= epsilon:
valid_actions = env.valid_actions()
return r.choice(valid_actions)
else:
prediction = self.agent.predict(s.reshape((1, self.state_dim)))[0]
return np.argmax(prediction)
def train_agent(self, batch_size):
""" Train Q-network on batch sampled from the buffer
"""
# Sample experience from memory buffer (optionally with PER)
s, a, r, d, new_s, idx = self.buffer.sample_batch(batch_size)
# Apply Bellman Equation on batch samples to train our DDQN
q = self.agent.predict(s)
next_q = self.agent.predict(new_s)
q_targ = self.agent.target_predict(new_s)
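        # Double DQN: the online network (next_q) picks the best next action,
        # and the target network (q_targ) supplies that action's value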
for i in range(s.shape[0]):
old_q = q[i, a[i]]
if d[i]:
q[i, a[i]] = r[i]
else:
next_best_action = np.argmax(next_q[i,:])
q[i, a[i]] = r[i] + self.gamma * q_targ[i, next_best_action]
if(self.with_per):
# Update PER Sum Tree
self.buffer.update(idx[i], abs(old_q - q[i, a[i]]))
# Train on batch
self.agent.fit(s, q)
def train(self, env, args, summary_writer):
""" Main DDQN Training Algorithm
"""
results = []
tqdm_e = tqdm(range(args.nb_episodes), desc='Score', leave=True, unit=" episodes")
for e in tqdm_e:
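            # play one fully greedy game first (epsilon = 0.00) to check current behaviour;
            # run_game does not memorize anything or contribute to training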
self.run_game(env)
# Reset episode
time, cumul_reward, done = 0, 0, False
old_state = env.reset()
while not done:
if args.render:
env.render()
# Actor picks an action (following the policy)
a = self.policy_action(old_state, env)
# Retrieve new state, reward, and whether the state is terminal
new_state, r, done, _ = env.step(a)
# Memorize for experience replay
self.memorize(old_state, a, r, done, new_state)
# Update current state
old_state = new_state
cumul_reward += r
time += 1
# Train DDQN and transfer weights to target network
if(self.buffer.size() > args.batch_size):
self.train_agent(args.batch_size)
self.agent.transfer_weights()
self.epsilon = self.epsilon - self.epsilon_decay
# Gather stats every episode for plotting
if(args.gather_stats):
mean, stdev = gather_stats(self, env)
results.append([e, mean, stdev])
# Export results for Tensorboard
score = tfSummary('score', cumul_reward)
summary_writer.add_summary(score, global_step=e)
summary_writer.flush()
# Display score
tqdm_e.set_description("Score: " + str(np.round(cumul_reward, 0)))
tqdm_e.refresh()
return results
def run_game(self, env):
time, cumul_reward, done = 0, 0, False
old_state = env.reset()
while not done:
# Actor picks an action (following the policy)
a = self.policy_action(old_state, env, 0.00)
# Retrieve new state, reward, and whether the state is terminal
new_state, r, done, _ = env.step(a)
# Update current state
old_state = new_state
cumul_reward += r
def memorize(self, state, action, reward, done, new_state):
""" Store experience in memory buffer
"""
if(self.with_per):
q_val = self.agent.predict(state)
q_val_t = self.agent.target_predict(new_state)
next_best_action = np.argmax(q_val)
new_val = reward + self.gamma * q_val_t[0, next_best_action]
td_error = abs(new_val - q_val)[0]
else:
td_error = 0
self.buffer.memorize(state, action, reward, done, new_state, td_error)
def save_weights(self, path):
path += '_LR_{}'.format(self.lr)
if(self.with_per):
path += '_PER'
self.agent.save(path)
def load_weights(self, path):
self.agent.load_weights(path)
import sys
import numpy as np
import keras.backend as K
from keras.optimizers import Adam
from keras.models import Model
from keras.layers import Input, Dense, Flatten, Reshape, LSTM, Lambda
from keras.regularizers import l2
from utils.networks import conv_block
class Agent:
""" Agent Class (Network) for DDQN
"""
def __init__(self, state_dim, action_dim, lr, tau, dueling):
self.state_dim = state_dim
self.action_dim = action_dim
self.tau = tau
self.dueling = dueling
# Initialize Deep Q-Network
self.model = self.network(dueling)
self.model.compile(Adam(lr), 'mse')
        self.model.summary()
# Build target Q-Network
self.target_model = self.network(dueling)
self.target_model.compile(Adam(lr), 'mse')
self.target_model.set_weights(self.model.get_weights())
def huber_loss(self, y_true, y_pred):
return K.mean(K.sqrt(1 + K.square(y_pred - y_true)) - 1, axis=-1)
def network(self, dueling):
""" Build Deep Q-Network
"""
inp = Input(batch_shape=(None, self.state_dim))
        # Two fully connected hidden layers on the flat state vector
x = Dense(256, activation='relu')(inp)
x = Dense(256, activation='relu')(x)
if(dueling):
# Have the network estimate the Advantage function as an intermediate layer
x = Dense(self.action_dim + 1, activation='linear')(x)
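            # the first output is the state value V(s), the remaining action_dim outputs
            # are the advantages A(s, a); combine them as V + (A - mean(A))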
x = Lambda(lambda i: K.expand_dims(i[:,0],-1) + i[:,1:] - K.mean(i[:,1:], keepdims=True), output_shape=(self.action_dim,))(x)
else:
x = Dense(self.action_dim, activation='linear')(x)
return Model(inp, x)
def transfer_weights(self):
""" Transfer Weights from Model to Target at rate Tau
"""
W = self.model.get_weights()
tgt_W = self.target_model.get_weights()
for i in range(len(W)):
tgt_W[i] = self.tau * W[i] + (1 - self.tau) * tgt_W[i]
self.target_model.set_weights(tgt_W)
def fit(self, inp, targ):
""" Perform one epoch of training
"""
self.model.fit(inp, targ, epochs=1, verbose=0)
def predict(self, inp):
""" Q-Value Prediction
"""
return self.model.predict(inp)
def target_predict(self, inp):
""" Q-Value Prediction (using target network)
"""
return self.target_model.predict(inp)
def reshape(self, x):
return x
def save(self, path):
if(self.dueling):
path += '_dueling'
self.model.save_weights(path + '.h5')
def load_weights(self, path):
self.model.load_weights(path)