I built a simple traffic light simulation in pygame, and my goal is to have an agent choose between different traffic light patterns so as to minimize the cars' waiting time (the score is calculated from how much of each car's speed is consumed down to 0 between frames). However, the model fails to learn the intended behavior.
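To make the reward concrete, this is roughly the idea behind the scoring (a simplified sketch only; the real logic lives in the environment code I've omitted below, and `car.speed` / `prev_speeds` are hypothetical stand-ins for my car objects):

# Sketch of the scoring idea: every frame, the speed each car loses
# (braking toward 0) is charged against the score, so cars held at a
# red light steadily drain points between frames.
def frame_score_delta(cars, prev_speeds):
    penalty = 0
    for car, prev in zip(cars, prev_speeds):
        penalty += max(prev - car.speed, 0)  # speed "consumed" this frame
    return -penalty  # added to env.score each frame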
I have tried several discount values (0.1, 0.5, 0.9), since I'm not sure how far into the future the agent needs to look for this particular task.
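For a rough sense of what each value implies: rewards more than about 1/(1 - gamma) steps ahead carry little weight, so the three values correspond to very different planning horizons (a quick back-of-the-envelope check):

# Effective planning horizon implied by each discount factor:
# rewards further than ~1/(1 - gamma) steps away are mostly ignored.
for g in (0.1, 0.5, 0.9):
    print(f"gamma={g}: horizon ~ {1 / (1 - g):.1f} decisions")
# gamma=0.1: horizon ~ 1.1 decisions
# gamma=0.5: horizon ~ 2.0 decisions
# gamma=0.9: horizon ~ 10.0 decisions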
I trained the agent for about 200k steps, and while it does favor certain actions (the two actions that turn the whole street green, which makes sense to me), it doesn't seem to pick them at the right moments; it looks more like it switches randomly between the two. The agent doesn't appear to care about the cars on the street at all.
The agent is able to learn simple policies in the same environment, e.g., if its goal is to maximize waiting time, it simply turns all the lights red.
import random
import numpy as np
import tensorflow as tf
import pygame
import matplotlib.pyplot as plt
from tensorflow.keras.models import load_model
#from utils import plotlearning
batch_size = 10
gamma = 0.9
epsilon = 1
epsilon_decay = 0.999
epsilon_end = 0.001
target_update = 200
memory_size = 100000
LR = 0.001
num_episodes = 1000
decisions_per_episode = 200
frames_per_step = 10
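# note: 1000 episodes x 200 decisions per episode = 200,000 agent
# decisions in total, which is where the ~200k training steps come from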
class ReplayBuffer(object):
    def __init__(self, max_size, input_shape, n_actions):
        # input_shape/n_actions are accepted but the frame shape is hardcoded below
        self.mem_size = max_size
        self.mem_cntr = 0
        # each stored state is a stack of four 106x60 frames
        self.state_memory = np.zeros((self.mem_size, 106, 60, 4))
        self.new_state_memory = np.zeros((self.mem_size, 106, 60, 4))
        self.action_memory = np.zeros(self.mem_size, dtype=np.int8)
        self.reward_memory = np.zeros(self.mem_size)

    def store_transition(self, state, action, reward, state_):
        # overwrite the oldest transition once the buffer is full
        index_num = int(self.mem_cntr % self.mem_size)
        self.new_state_memory[index_num] = state_
        self.state_memory[index_num] = state
        self.reward_memory[index_num] = reward
        self.action_memory[index_num] = action
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        # sample a random minibatch (without repeating transitions)
        batch = np.random.choice(max_mem, batch_size, replace=False)
        states = self.state_memory[batch]
        states_ = self.new_state_memory[batch]
        rewards = self.reward_memory[batch]
        actions = self.action_memory[batch]
        return states, actions, rewards, states_
def build_dqn(n_actions):
    model = tf.keras.models.Sequential()
    # convolution
    model.add(tf.keras.layers.Conv2D(16, kernel_size=8, strides=(4, 4),
                                     activation="relu", input_shape=(106, 60, 4)))
    model.add(tf.keras.layers.Conv2D(32, kernel_size=4, strides=(2, 2), activation="relu"))
    # flatten
    model.add(tf.keras.layers.Flatten())
    # dense
    model.add(tf.keras.layers.Dense(256, activation=tf.nn.relu))
    # output: one linear Q-value per action
    model.add(tf.keras.layers.Dense(n_actions))
    model.compile(loss='mse',
                  optimizer=tf.keras.optimizers.Adam(learning_rate=LR))
    return model
class Agent(object):
    def __init__(self, n_actions, input_dims, eps, eps_dec, eps_end,
                 mem_size=1000, fname='dqn_model_positive_reward_0.5gamma.h5'):
        self.action_space = [i for i in range(n_actions)]
        self.n_actions = n_actions
        self.model_file = fname
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        # online network and target network share the same architecture
        self.q_eval = build_dqn(n_actions)
        self.target_model = build_dqn(n_actions)
        self.epsilon = eps
        self.epsilon_dec = eps_dec
        self.epsilon_end = eps_end

    def remember(self, state, action, reward, new_state):
        self.memory.store_transition(state, action, reward, new_state)

    def choose_action(self, state):
        # epsilon-greedy: explore with probability epsilon, otherwise greedy
        rand = np.random.random()
        if rand < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            output = self.q_eval.predict(state)
            print(output)
            action = int(np.argmax(output))
            print(action)
        return action

    def copy_nn(self):
        # sync the target network with the online network
        self.target_model.set_weights(self.q_eval.get_weights())
    def learn(self):
        # wait until at least one minibatch of transitions is stored
        if self.memory.mem_cntr < batch_size:
            return
        state, action, reward, new_state = self.memory.sample_buffer(batch_size)
        q_eval = self.q_eval.predict(state)
        q_next = self.target_model.predict(new_state)
        # Bellman target: only the taken action's Q-value is updated,
        # every other action keeps its predicted value (zero loss)
        q_target = q_eval.copy()
        batch_index = np.arange(batch_size, dtype=np.int32)
        q_target[batch_index, action] = reward + gamma * np.max(q_next, axis=1)
        _ = self.q_eval.fit(state, q_target, verbose=0)
        # decay epsilon once per learning step
        self.epsilon = self.epsilon * self.epsilon_dec if self.epsilon > self.epsilon_end else self.epsilon_end

    def save_model(self):
        self.q_eval.save(self.model_file)

    def load_model(self):
        self.q_eval = load_model(self.model_file)
env = Environment()  # defined in the omitted environment code
agent = Agent(input_dims=57600, n_actions=7, eps=1,
              eps_dec=0.1**(1/(num_episodes*decisions_per_episode)),
              eps_end=0.001, mem_size=memory_size)
scores = []
eps_history = []
last_action = 0

for i in range(num_episodes):
    env.reset()
    old_score = 0
    # start each episode from a random light pattern
    state = env.get_state(random.randint(0, 6))
    decisions = 0
    env.spawn_delay = random.randint(20, 200)
    if i % target_update == 0:
        agent.copy_nn()
    # refill active_spawns list
    env.active_spawns = []
    for element in range(21):
        env.active_spawns.append(random.randint(0, 7))
    while decisions < decisions_per_episode:
        decisions += 1
        action = agent.choose_action(state)
        # hold the chosen light pattern for several frames
        for _ in range(frames_per_step):
            env.step(action)
        old_action = action
        if len(env.all_cars) != 0:
            reward = env.score - old_score
        else:
            reward = 0
        state_ = env.get_state(old_action)
        agent.remember(state, action, reward, state_)
        state = state_
        old_score = env.score
        agent.learn()
    eps_history.append(agent.epsilon)
    scores.append(env.score)
    if i % 10 == 0 and i > 0:
        agent.save_model()

pygame.quit()
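One note on the epsilon schedule: eps_dec is chosen so that epsilon decays multiplicatively from 1 down to roughly 0.1 over the whole run, with one decay per learn() call. A quick check of that arithmetic:

# epsilon is multiplied by eps_dec once per decision, so after the full
# run it lands at exactly 0.1 ** 1 = 0.1, still above the 0.001 floor.
total_decisions = num_episodes * decisions_per_episode  # 200,000
eps_dec = 0.1 ** (1 / total_decisions)
print(eps_dec ** total_decisions)  # ~0.1 up to floating point error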
This is the part that trains the model. I left out the environment code because it shouldn't matter here; if that part is needed, let me know.