Keras reinforcement learning model is not learning

Asked: 2020-07-13 04:36:40

Tags: python-3.x keras deep-learning artificial-intelligence reinforcement-learning

My name is Andy, I'm new to Stack Overflow, and this is my first question.

I started learning Python about 40 days ago because of COVID-19, moved into machine learning / Q-learning about 3 weeks ago, and have been stuck ever since.

Goal: Get the computer to play Rad Racer 2 (an NES racing game) using reinforcement learning.

Plan for getting this done:

After going through various tutorials/sites, I decided to use a double network (a main network plus a target network) for training/learning. Since I had watched some tutorial videos on Keras basics, I built it in Keras with two 256-filter convolutional layers.
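
In other words, the main network is the one picking actions, while the target network only supplies the bootstrapped Q-target, new_q = reward + DISCOUNT * max(target_model.predict(next_state)), which is what the train() method further down computes for non-terminal steps.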

3 actions: hold accelerate (J), accelerate + left (JA), accelerate + right (JD). I send the inputs to the game using DirectInput key codes I found online, because sending regular key presses did not work.
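
The key sending is essentially the ctypes SendInput scan-code snippet that circulates online; a trimmed sketch of that approach (Windows only, with just the three scan codes I need):

import ctypes

PUL = ctypes.POINTER(ctypes.c_ulong)

class KeyBdInput(ctypes.Structure):
    _fields_ = [("wVk", ctypes.c_ushort), ("wScan", ctypes.c_ushort),
                ("dwFlags", ctypes.c_ulong), ("time", ctypes.c_ulong),
                ("dwExtraInfo", PUL)]

class HardwareInput(ctypes.Structure):
    _fields_ = [("uMsg", ctypes.c_ulong), ("wParamL", ctypes.c_short),
                ("wParamH", ctypes.c_ushort)]

class MouseInput(ctypes.Structure):
    _fields_ = [("dx", ctypes.c_long), ("dy", ctypes.c_long),
                ("mouseData", ctypes.c_ulong), ("dwFlags", ctypes.c_ulong),
                ("time", ctypes.c_ulong), ("dwExtraInfo", PUL)]

class Input_I(ctypes.Union):
    _fields_ = [("ki", KeyBdInput), ("mi", MouseInput), ("hi", HardwareInput)]

class Input(ctypes.Structure):
    _fields_ = [("type", ctypes.c_ulong), ("ii", Input_I)]

# DirectInput scan codes for the keys the game needs
SCAN_J, SCAN_A, SCAN_D = 0x24, 0x1E, 0x20

def _send_scan(scan_code, flags):
    extra = ctypes.c_ulong(0)
    ii = Input_I()
    ii.ki = KeyBdInput(0, scan_code, flags, 0, ctypes.pointer(extra))
    cmd = Input(ctypes.c_ulong(1), ii)  # type 1 = INPUT_KEYBOARD
    ctypes.windll.user32.SendInput(1, ctypes.pointer(cmd), ctypes.sizeof(cmd))

def press_key(scan_code):
    _send_scan(scan_code, 0x0008)            # KEYEVENTF_SCANCODE

def release_key(scan_code):
    _send_scan(scan_code, 0x0008 | 0x0002)   # KEYEVENTF_SCANCODE | KEYEVENTF_KEYUP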

I know people use Gym Retro for this kind of game, but I wanted to understand the inner workings of the rewards/observations, so I use YOLOv5 to detect lines/objects. Based on the YOLOv5 results, I calculate the reward for that step.
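
To give an idea of what I mean by computing the reward from the detections, it is roughly along these lines (simplified; the label names and reward values below are just placeholders, not my actual classes, and the real reward code is part of what I trimmed):

def reward_from_detections(detections):
    """detections: list of (label, confidence, (x1, y1, x2, y2)) tuples from the detector."""
    reward = 0.0
    for label, confidence, box in detections:
        if confidence < 0.5:
            continue                # ignore weak detections
        if label == 'road_line':
            reward += 0.1           # track still visible -> small positive reward
        elif label == 'crash':
            reward -= 1.0           # crash detected -> big penalty, episode ends
    return reward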

My input is a series of 4 grayscale images held in a deque to capture motion, which I then stack together with NumPy. Once I have collected enough experience / replay memory (1500 transitions), I start training at the end of each episode instead of after every step; I found that training after every step made the agent lag way behind the game.
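
The stacking itself looks roughly like this (a simplified version of what the episode loop further down does; the transpose and the black-frame padding at episode start are left out here):

from collections import deque
import numpy as np

frame_stack = deque(maxlen=4)   # the oldest frame drops out automatically

def stack_frames(frame_stack, new_frame):
    frame_stack.append(new_frame)              # new_frame: (h, w) grayscale image
    state = np.stack(frame_stack, axis=2)      # -> (h, w, 4)
    state = state.astype(np.float32) / 255.0   # normalize to [0, 1]
    return np.expand_dims(state, axis=0)       # -> (1, h, w, 4) for model.predict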

Problem:

My biggest problem at the moment is that the model doesn't seem to learn properly. It looks like it is doing okay up to around episode 20-30, then it gets worse and worse, to the point where it just keeps doing a single action for an hour at a time.

I have tried different learning rates (0.1 - 0.00001), different inputs (a single BGR layer, a grayscale layer, 4 stacked layers, etc.), and different epsilon decay rates. I have commented out most of the reward items and kept only the basic reward for now.

Most of the code apart from the YOLO stuff is below; I had to delete a few lines because of the character limit.

# imports used by the code shown below (assuming tf.keras; the rest of my setup/helper code is trimmed)
import random
import time
from collections import deque

import numpy as np
import pygetwindow as gw
from tqdm import tqdm
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, Activation, MaxPooling2D, Dropout, Flatten, Dense
from tensorflow.keras.optimizers import Adam

# parameters
training = True
learning_rate = 0.0001
DISCOUNT = 0.99
REPLAY_MEMORY_SIZE = 50_000  # How many last steps to keep for model training
MIN_REPLAY_MEMORY_SIZE = 1500  # Minimum number of steps in a memory to start training
MINIBATCH_SIZE = 1000  # How many steps (samples) to use for training
batch_size = 32
UPDATE_TARGET_EVERY = 0  # Terminal states (end of episodes)
MODEL_NAME = 'RC'
MIN_REWARD = 0  # For model save
save_every = 5  # save every x episodes

EPISODES = 2_000

# Exploration settings
if training is True:
    epsilon = 1  # not a constant, going to be decayed
else:
    epsilon = 0 

MIN_EPSILON = 0.01
START_EPISODE_DECAY = 0
END_EPISODE_DECAY = 20
if epsilon > MIN_EPSILON:
    EPS_DECAY = -(epsilon/((END_EPISODE_DECAY-START_EPISODE_DECAY)/epsilon))
else:
    EPS_DECAY = 0
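# with the values above: EPS_DECAY = -(1 / ((20 - 0) / 1)) = -0.05,
# so epsilon falls linearly from 1 toward 0 over the first ~20 episodes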
  

# Agent class
class DQNAgent:

    def __init__(self):

        # Main model
        self.model = self.create_model()
        # self.model = self.load_model()

        # Target network
        self.target_model = self.create_model()
        self.target_model.set_weights(self.model.get_weights())

        # An array with last n steps for training
        self.replay_memory = deque(maxlen=REPLAY_MEMORY_SIZE)

        # Used to count when to update target network with main network's weights
        self.target_update_counter = 0

    def create_model(self):

        dropout = 0.1

        model = Sequential()

        model.add(Conv2D(256, (2, 2), input_shape=(int(height/resize_ratio), int(width/resize_ratio), img_channels)))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(dropout))

        model.add(Conv2D(256, (2, 2)))
        model.add(Activation('relu'))
        model.add(MaxPooling2D(pool_size=(2, 2)))
        model.add(Dropout(dropout))

        model.add(Flatten())
        model.add(Dense(64))

        model.add(Dense(env.ACTION_SPACE_SIZE, activation='linear'))  # ACTION_SPACE_SIZE = how many choices (9)
        model.compile(loss="mse", optimizer=Adam(lr=learning_rate), metrics=['accuracy'])

        return model
 
    # Trains main network at end of episode
    def train(self, terminal_state):
        # Start training only if certain number of samples is already saved
        if len(self.replay_memory) < MIN_REPLAY_MEMORY_SIZE:
            return

        minibatch = random.sample(self.replay_memory, MINIBATCH_SIZE)

        current_states = np.array([transition[0] for transition in minibatch])
        # from (MINIBATCH_SIZE, 1, h, w, 4) > (MINIBATCH_SIZE, h, w, 4)
        current_states = current_states.reshape(current_states.shape[0], current_states.shape[2],
                                                current_states.shape[3], current_states.shape[4])
        current_qs_list = self.model.predict(current_states)

        new_current_states = np.array([transition[3] for transition in minibatch])
        new_current_states = new_current_states.reshape(new_current_states.shape[0], new_current_states.shape[2],
                                                        new_current_states.shape[3], new_current_states.shape[4])
        # new_current_states = np.expand_dims(new_current_states, axis=-1)
        future_qs_list = self.target_model.predict(new_current_states)

        X = []
        y = []

        for index, (current_state_img, current_action, current_reward, new_current_img, current_done) in enumerate(minibatch):

            if not current_done:
                max_future_q = np.max(future_qs_list[index])
                new_q = current_reward + (DISCOUNT * max_future_q)
            else:
                new_q = 0.0

            current_qs = current_qs_list[index]

            current_qs[current_action] = new_q

            X.append(np.squeeze(current_state_img, axis=0))
            y.append(current_qs)

        X = np.array(X)
        # X = np.expand_dims(X, axis=-1)
        # X = X.reshape(X.shape[0], X.shape[2], X.shape[3], X.shape[4])
        y = np.array(y)

        self.model.fit(X, y, batch_size=batch_size, verbose=0, shuffle=False)
        # self.model.train_on_batch(X, y)
        if terminal_state:
            self.target_update_counter += 1

        # If counter reaches set value, update target network with weights of main network
        if self.target_update_counter > UPDATE_TARGET_EVERY:
            self.target_model.set_weights(self.model.get_weights())
            self.target_update_counter = 0
            print('target_model trained!')

    # Queries main network for Q values given current observation space (environment state)
    def get_qs(self, state):
        result = self.model.predict(state)
        result = result[0]
        return result
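
    # note: the update_replay_memory helper was among the lines I trimmed for length;
    # it only needs to append the transition to the replay deque, i.e. roughly:
    def update_replay_memory(self, transition):
        self.replay_memory.append(transition)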


agent = DQNAgent()

current_img_stack = deque(maxlen=4)

# make the game active
game = gw.getWindowsWithTitle('Mesen')[0]
game.activate()
time.sleep(1)
release_all()

# Iterate over episodes
for episode in tqdm(range(1, EPISODES + 1), ascii=True, unit='episodes'):

    episode_reward = 0
    step = 1

    if episode <= START_EPISODE_DECAY - 1:
        start_epsilon = False
    elif episode >= END_EPISODE_DECAY + 1:
        start_epsilon = False
    else:
        start_epsilon = True

    # Reset environment and get initial state
    # blackscreens followed by the 1st screen starting out
    current_state = env.reset()
    blackscreen = np.zeros_like(current_state)

    current_img_stack.append(blackscreen)
    current_img_stack.append(blackscreen)
    current_img_stack.append(blackscreen)
    current_img_stack.append(current_state)
    stacked_state = np.stack(current_img_stack, axis=2)
    stacked_state = np.ascontiguousarray(stacked_state, dtype=np.float32) / 255
    stacked_state = np.transpose(stacked_state, (1, 0, 2))
    stacked_state = np.expand_dims(stacked_state, axis=0)

    start_time = time.time()
    # Reset flag and start iterating until episode ends
    done = False

    while not done:

        if np.random.random() > epsilon:
            action = np.argmax(agent.get_qs(stacked_state))
        else:
            action = np.random.randint(0, env.ACTION_SPACE_SIZE)

        new_state, reward, done, prediction, preview = env.step(action)

        if done is False:
            next_img_stack = current_img_stack
            next_img_stack.append(new_state)
            next_stack = np.stack(next_img_stack, axis=2)
            next_stack = np.ascontiguousarray(next_stack, dtype=np.float32) / 255
            next_stack = np.transpose(next_stack, (1, 0, 2))
            next_stack = np.expand_dims(next_stack, axis=0)

            # current_state = new_state
            current_img_stack = next_img_stack
            stacked_state = next_stack

        else:
            next_img_stack = current_img_stack
            next_img_stack.append(blackscreen)
            next_stack = np.stack(next_img_stack, axis=2)
            next_stack = np.ascontiguousarray(next_stack, dtype=np.float32) / 255
            next_stack = np.transpose(next_stack, (1, 0, 2))
            next_stack = np.expand_dims(next_stack, axis=0)


        step += 1
        episode_reward += reward
        ep_rewards.append(episode_reward)

        if SHOW_PREVIEW:
            env.render(preview, prediction)

        if training is True:
            agent.update_replay_memory((stacked_state, action, reward, next_stack, done))

        # print(episode_reward)
        if done is True:
            ep_reward_final.append(episode_reward)
            print('  Epsilon(' + str(epsilon) + ') EPtimes(' + str(time.time() - start_time) + ') done('
                  + str(done) + ') step(' + str(step) + ') EPreward(' + str(episode_reward) +
                  ') best_reward_this_session(' + str(max(ep_reward_final)) + ') fps(' +
                  str(step/(time.time() - start_time)) + ')')
            # plot(ep_reward_final)
            if training is True:
                agent.train(done)

    # Decay epsilon
    if show_info is False and epsilon <= MIN_EPSILON:
        print(f"\nEPS_DECAY ended on episode {episode} - epsilon {epsilon}")
        epsilon = MIN_EPSILON
        show_info = True
    elif start_epsilon is True:
        epsilon += EPS_DECAY

0 Answers:

No answers yet.