Python Deep Q Network Snake won't learn

Time: 2019-12-03 17:04:46

Tags: python deep-learning q-learning dqn

So I wanted to combine this snake game with the DQN found in this article. First, I tried changing the NN's input layer to 400 inputs. The game's field is 20 by 20, so I thought I could feed the NN a 2D array in which 0 stands for an ordinary field cell, 1 for the snack, 3 for the snake's head, and 2 for the rest of the body. That didn't work, so I changed the input to match the one from the article I got the DQN from. That didn't work either, so I think the snake may simply not be learning properly. Note that, for now, the snake does not die when it collides with its own body, and the get_state function does not treat the snake's body as a danger. I also tried shrinking the field (to 5 by 5) at the beginning (curriculum learning), but the snake still doesn't seem to learn.
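For reference, the 400-input grid encoding described above could be built roughly like this (a sketch only; get_state_grid is a hypothetical name, and it assumes the cube objects from the repo expose .pos as an (x, y) tuple):

import numpy as np

def get_state_grid(snack, s, rows=20):
    # 0 = empty cell, 1 = snack, 2 = body segment, 3 = head
    grid = np.zeros((rows, rows), dtype=int)
    grid[snack.pos[1]][snack.pos[0]] = 1
    for c in s.body[1:]:
        grid[c.pos[1]][c.pos[0]] = 2
    grid[s.body[0].pos[1]][s.body[0].pos[0]] = 3
    return grid.flatten()  # 400 values for the NN input layer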

Did I implement something incorrectly, or is the agent just not learning at all?

Here is the repo with the full code:

https://github.com/Dyzlee/DQN-Snake-unfinished

Here is the main function of the snake game, where most of the DQN methods are called:

global width, rows, s, snack
width = 500
rows = 20
highscore = 0
win = pygame.display.set_mode((width, width+100))  # Create window
s = snake(RED, (10, 10))
snack = cube(randomSnack(rows, s), color=GREEN)
agent = DQNAgent()
speed = 0  # 0 --> Fast speed ; 10 --> Normal speed

pygame.init()  # Init pygame
clock = pygame.time.Clock()

while s.numOfGame < 300:  # Only play 300 games
    clock.tick(speed)  # Delay for speed

    agent.epsilon = 80 - s.numOfGame  # Decay epsilon: start high and shrink as more games are played

    state_old = agent.get_state(snack, s)  # Get the state BEFORE the move is made

    if random.randint(0, 200) < agent.epsilon:
        # Explore
        finale_move = random.randint(0, 3)  # Random move: 0 = Left, 1 = Up, 2 = Right, 3 = Down
        # print('Explore')
    else:
        # Exploitation
        prediction = agent.model.predict(state_old.reshape((1, 11)))  # Get action for given state from NN
        finale_move = np.argmax(prediction[0])  # Predicted move
        # print('Predicted move:', finale_move)
        # print('Exploit')

    s.move(finale_move)  # Execute finale_move
    state_new = agent.get_state(snack, s)  # Get the state AFTER the move is made

    appleWasEaten = False  # Bool for reward
    if s.body[0].pos == snack.pos:  # If snake eats an apple
        s.addCube()
        appleWasEaten = True  # If an apple was eaten set to true
        snack = cube(randomSnack(rows, s), color=GREEN)

    '''for x in range(len(s.body)):
        if s.body[x].pos in list(map(lambda z: z.pos, s.body[x + 1:])):  # If snake bites its own tail
            s.die()
            break'''

    reward = agent.set_reward(s.isDead, appleWasEaten)  # Set reward for the new state following the action
    #print(reward)

    agent.train_short_memory(state_old, finale_move, reward, state_new, s.isDead)  # Train short memory with new action and state

    agent.remember(state_old, finale_move, reward, state_new, s.isDead)  # Save the new data in long term memory

    if s.isDead:  # If the die() function was called, isDead is True and then the game is reset
        agent.replay_new(agent.memory)  # Fit the neural network
        s.reset((10, 10))
        initialize_game(agent, appleWasEaten)

    highscore = getHighscore(highscore)  # Set highscore
    redrawWindow(win, highscore)

agent.model.save_weights('weights.hdf5')
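To make the training step above concrete: for a single non-terminal transition, train_short_memory computes the standard Q-learning target, reward + gamma * max Q(next_state). Worked through with made-up numbers purely for illustration:

reward = 10        # apple was eaten this step
gamma = 0.9        # discount factor from DQNAgent
best_next_q = 2.0  # np.amax(model.predict(next_state)[0]), assumed value
target = reward + gamma * best_next_q  # 10 + 0.9 * 2.0 = 11.8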

Here is the entire DQN class:

from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
import random
import numpy as np
import pandas as pd
from operator import add

class DQNAgent(object):

    def __init__(self):
        self.reward = 0
        self.gamma = 0.9
        self.dataframe = pd.DataFrame()
        self.short_memory = np.array([])
        self.agent_target = 1
        self.agent_predict = 0
        self.learning_rate = 0.0005
        self.model = self.network()
        # self.model = self.network("weights.hdf5")
        self.epsilon = 0
        self.actual = []
        self.memory = []

    def get_state(self, snack, s):  # Get state for the NN

        new_state = [
            (s.body[0].dirnx == -1 and s.body[0].pos[0] < 1) or (s.body[0].dirnx == 1 and s.body[0].pos[0] > 18)
            or (s.body[0].dirny == 1 and s.body[0].pos[1] > 18) or (s.body[0].dirny == -1 and s.body[0].pos[1] < 1),  # Danger Straight

            (s.body[0].dirnx == -1 and s.body[0].pos[1] < 1) or (s.body[0].dirnx == 1 and s.body[0].pos[1] > 18)
            or (s.body[0].dirny == 1 and s.body[0].pos[0] < 1) or (s.body[0].dirny == -1 and s.body[0].pos[0] > 18),  # Danger Right

            (s.body[0].dirnx == -1 and s.body[0].pos[1] > 18) or (s.body[0].dirnx == 1 and s.body[0].pos[1] < 1)
            or (s.body[0].dirny == 1 and s.body[0].pos[0] > 18) or (s.body[0].dirny == -1 and s.body[0].pos[0] < 1),  # Danger Left

            s.body[0].dirnx == -1,  # Move Left
            s.body[0].dirnx == 1,  # Move Right
            s.body[0].dirny == -1,  # Move Up
            s.body[0].dirny == 1,  # Move Down
            s.body[0].pos[0] > snack.pos[0],  # Snack Left
            s.body[0].pos[0] < snack.pos[0],  # Snack Right
            s.body[0].pos[1] > snack.pos[1],  # Snack Up
            s.body[0].pos[1] < snack.pos[1]  # Snack Down
        ]

        for i in range(len(new_state)):  # Convert booleans to 0/1 for the NN
            if new_state[i]:
                new_state[i] = 1
            else:
                new_state[i] = 0
        #print(new_state)

        return np.asarray(new_state)

    def set_reward(self, snakeIsDead, appleWasEaten):  # Sets the reward for the current state
        self.reward = 0
        if snakeIsDead:
            self.reward = -10
        if appleWasEaten:
            self.reward = 10
        return self.reward

    def network(self, weights=None):
        model = Sequential()
        model.add(Dense(units=120, activation='relu', input_dim=11))
        model.add(Dropout(0.15))
        model.add(Dense(units=120, activation='relu'))
        model.add(Dropout(0.15))
        model.add(Dense(units=120, activation='relu'))
        model.add(Dropout(0.15))
        model.add(Dense(units=4, activation='softmax'))
        opt = Adam(self.learning_rate)
        model.compile(loss='mse', optimizer=opt)

        if weights:
            model.load_weights(weights)

        return model

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def replay_new(self, memory):
        # Experience replay: fit the network on a random minibatch of stored transitions
        if len(memory) > 1000:
            minibatch = random.sample(memory, 1000)
        else:
            minibatch = memory
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                # Q-learning target: reward + gamma * max_a' Q(next_state, a')
                target = reward + self.gamma * np.amax(self.model.predict(np.array([next_state]))[0])
            target_f = self.model.predict(np.array([state]))
            target_f[0][np.argmax(action)] = target  # Only the taken action's Q-value is updated
            self.model.fit(np.array([state]), target_f, epochs=1, verbose=0)

    def train_short_memory(self, state, action, reward, next_state, done):
        # Single-step update on the most recent transition
        target = reward
        if not done:
            target = reward + self.gamma * np.amax(self.model.predict(next_state.reshape((1, 11)))[0])
        target_f = self.model.predict(state.reshape((1, 11)))
        target_f[0][np.argmax(action)] = target
        self.model.fit(state.reshape((1, 11)), target_f, epochs=1, verbose=0)
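One thing worth noting when reading the two training methods: the np.argmax(action) lookups appear to assume a one-hot action encoding, rather than the plain integer that the main loop passes in. A minimal sketch of such an encoding (to_one_hot is a hypothetical helper, not part of the repo):

def to_one_hot(move, n_actions=4):
    # e.g. move 2 (Right) -> [0, 0, 1, 0]
    vec = np.zeros(n_actions, dtype=int)
    vec[move] = 1
    return vec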

0 Answers:

No answers