Reinforcement learning - not converging

Time: 2019-09-09 07:18:26

Tags: python keras reinforcement-learning

I built a simple traffic light simulation in pygame. My goal is to have an agent choose between different traffic light patterns so that the waiting time of the cars is minimized (the score is computed from how much each car's speed drops towards 0 between frames). However, the model does not learn the intended behavior.
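To make that concrete: the reward I feed the agent is the per-decision change of this score. A minimal sketch of the scoring idea (placeholder names, not my actual environment code):

def frame_score_delta(cars):
    # hypothetical: each car contributes its speed deficit, so waiting cars pull the score down
    return -sum(car.max_speed - car.speed for car in cars)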

I have tried several discount values (0.1, 0.5, 0.9), since I am not sure how far into the future the agent needs to look for this particular task.
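As a rough rule of thumb, a discount factor gamma lets the agent look about 1 / (1 - gamma) decisions ahead, which is why I tried those three values:

for g in (0.1, 0.5, 0.9):
    print(g, round(1 / (1 - g), 1))  # effective horizon of roughly 1, 2 and 10 decisions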

I trained the agent for about 200,000 steps. Although it favors certain actions (the two actions that turn the whole street green, which makes sense to me), it does not seem to pick them at the right moments; it looks more like it switches between them at random. The agent does not seem to care about the cars on the street at all.

The agent is able to learn simpler policies in the same environment, e.g. if its goal is to maximize waiting time, it simply turns all lights red.


import random

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import pygame
from tensorflow.keras.models import load_model  # needed by Agent.load_model
#from utils import plotlearning

batch_size = 10
gamma = 0.9

epsilon = 1
epsilon_decay = 0.999
epsilon_end = 0.001
target_update = 200
memory_size = 100000
LR = 0.001

num_episodes = 1000
decisions_per_episode = 200
frames_per_step = 10

class ReplayBuffer(object):
    def __init__(self, max_size, input_shape, n_actions):
        self.mem_size = max_size
        self.mem_cntr = 0
        # observations have shape (106, 60, 4)
        self.state_memory = np.zeros((self.mem_size, 106, 60, 4))
        self.new_state_memory = np.zeros((self.mem_size, 106, 60, 4))
        self.action_memory = np.zeros(self.mem_size, dtype=np.int8)  # 7 discrete actions fit in int8
        self.reward_memory = np.zeros(self.mem_size)

    def store_transition(self, state, action, reward, state_):
        # overwrite the oldest entry once the buffer is full
        index_num = self.mem_cntr % self.mem_size
        self.state_memory[index_num] = state
        self.new_state_memory[index_num] = state_
        self.reward_memory[index_num] = reward
        self.action_memory[index_num] = action
        self.mem_cntr += 1

    def sample_buffer(self, batch_size):
        max_mem = min(self.mem_cntr, self.mem_size)
        batch = np.random.choice(max_mem, batch_size)

        states = self.state_memory[batch]
        states_ = self.new_state_memory[batch]
        rewards = self.reward_memory[batch]
        actions = self.action_memory[batch]

        return states, actions, rewards, states_


def build_dqn(n_actions):
    model = tf.keras.models.Sequential()

    # convolution
    model.add(tf.keras.layers.Conv2D(16, kernel_size=8, strides=(4, 4), activation="relu", input_shape=(106, 60, 4)))
    model.add(tf.keras.layers.Conv2D(32, kernel_size=4, strides=(2, 2), activation="relu"))

    # flatten
    model.add(tf.keras.layers.Flatten())

    # dense
    model.add(tf.keras.layers.Dense(256, activation="relu"))

    # output: one linear Q-value per action
    model.add(tf.keras.layers.Dense(n_actions))

    model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=LR))

    return model



class Agent(object):
    def __init__(self, n_actions, input_dims, eps, eps_dec, eps_end, mem_size=1000, fname='dqn_model_positive_reward_0.5gamma.h5'):
        self.action_space = [i for i in range(n_actions)]
        self.n_actions = n_actions
        self.model_file = fname

        self.memory = ReplayBuffer(mem_size, input_dims, n_actions)
        self.q_eval = build_dqn(n_actions)        # online network
        self.target_model = build_dqn(n_actions)  # target network, synced via copy_nn()
        self.epsilon = eps
        self.epsilon_dec = eps_dec
        self.epsilon_end = eps_end


    def remember(self, state, action, reward, new_state):
        self.memory.store_transition(state, action, reward, new_state)

    def choose_action(self, state):
        # epsilon-greedy: explore with probability epsilon, otherwise act greedily
        rand = np.random.random()

        if rand < self.epsilon:
            action = np.random.choice(self.action_space)
        else:
            output = self.q_eval.predict(state)  # state comes in batched as (1, 106, 60, 4)
            action = int(np.argmax(output))
        return action

    def copy_nn(self):
        # copy the online network's weights into the target network
        self.target_model.set_weights(self.q_eval.get_weights())

    def learn(self):
        # wait until there are enough transitions for one batch
        if self.memory.mem_cntr < batch_size:
            return

        state, action, reward, new_state = self.memory.sample_buffer(batch_size)

        q_eval = self.q_eval.predict(state)            # Q-values from the online network
        q_next = self.target_model.predict(new_state)  # Q-values from the target network

        # Q-learning target: r + gamma * max_a' Q_target(s', a')
        # (no terminal flag is stored; episodes simply end after a fixed number of decisions)
        q_target = q_eval.copy()
        batch_index = np.arange(batch_size, dtype=np.int32)
        q_target[batch_index, action] = reward + gamma * np.max(q_next, axis=1)

        _ = self.q_eval.fit(state, q_target, verbose=0)

        # decay epsilon towards its minimum
        self.epsilon = self.epsilon * self.epsilon_dec if self.epsilon > self.epsilon_end else self.epsilon_end

    def save_model(self):
        self.q_eval.save(self.model_file)

    def load_model(self):
        self.q_eval = load_model(self.model_file)


# Environment is defined in the simulation code that I left out (see the note at the end)
env = Environment()

agent = Agent(input_dims=57600, n_actions=7, eps=1,
              eps_dec=0.1 ** (1 / (num_episodes * decisions_per_episode)),
              eps_end=0.001)

scores = []
eps_history = []
last_action = 0



for i in range(num_episodes):
    env.reset()
    old_score = 0
    state = env.get_state(random.randint(0, 6))  # start from a random light pattern
    decisions = 0
    env.spawn_delay = random.randint(20, 200)

    # sync the target network every target_update episodes
    if i % target_update == 0:
        agent.copy_nn()

    # refill the active_spawns list
    env.active_spawns = []
    for element in range(21):
        env.active_spawns.append(random.randint(0, 7))



    while decisions < decisions_per_episode:
        decisions += 1
        action = agent.choose_action(state)

        # hold the chosen light pattern for several simulation frames
        for _ in range(frames_per_step):
            env.step(action)

        old_action = action

        # reward is the change in score since the previous decision
        if len(env.all_cars) != 0:
            reward = env.score - old_score
        else:
            reward = 0

        state_ = env.get_state(old_action)
        agent.remember(state, action, reward, state_)
        state = state_
        old_score = env.score
        agent.learn()

    eps_history.append(agent.epsilon)  # track the agent's decayed epsilon
    scores.append(env.score)



    if i % 10 == 0 and i > 0:
        agent.save_model()



pygame.quit()

This is the training part of the model. I left out the environment code since it shouldn't matter much; let me know if you need that part too.
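For reference, the training loop above only relies on an interface roughly like this (a hypothetical sketch, not the omitted code itself):

class Environment:
    def __init__(self):
        self.score = 0           # cumulative score; the reward is its per-decision change
        self.all_cars = []       # cars currently in the simulation
        self.spawn_delay = 0     # frames between car spawns
        self.active_spawns = []  # spawn points used in the current episode

    def reset(self):
        ...  # start a new episode

    def step(self, action):
        ...  # advance the simulation by one frame under the chosen light pattern

    def get_state(self, action):
        ...  # return the observation fed to the network (compatible with the (106, 60, 4) input)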
