So I wanted to combine this snake game with the DQN I found in this article.
First, I tried changing the input layer of the NN to 400 inputs. The game's field is 20 by 20, so I figured I could feed the NN a 2D array where 0 stands for a normal field cell, 1 for the snack, 3 for the snake's head and 2 for the rest of the body (a rough sketch of what I mean is below). But that didn't work, so I changed the input to match the one from this article I got the DQN from. That didn't work either, so I think the problem might be that the snake isn't learning properly. Note that, for now, the snake does not die from colliding with its own body, and the get_state function does not treat the snake's body as a danger. I also tried shrinking the field at the start (down to 5 by 5) as a form of curriculum learning, but the snake still doesn't seem to learn.
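To make the 400-input idea clearer, here is roughly what I mean (just an illustrative sketch using the same s.body / snack.pos attributes that appear in the main loop below; grid_state is not a function from my repo):

import numpy as np

def grid_state(s, snack, rows=20):
    # Sketch only: build the rows x rows grid described above and flatten it to rows*rows inputs
    grid = np.zeros((rows, rows), dtype=np.float32)   # 0 = normal field cell
    for c in s.body[1:]:
        grid[c.pos[1]][c.pos[0]] = 2                  # 2 = body segment
    grid[s.body[0].pos[1]][s.body[0].pos[0]] = 3      # 3 = snake head
    grid[snack.pos[1]][snack.pos[0]] = 1              # 1 = snack
    return grid.flatten()                             # shape (400,) when rows == 20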
Did I implement something wrong, or is the agent simply not learning at all?
Here is the repo with the complete code:
https://github.com/Dyzlee/DQN-Snake-unfinished
Here is the main function of the snake game, where most of the DQN methods are called:
global width, rows, s, snack
width = 500
rows = 20
highscore = 0
win = pygame.display.set_mode((width, width+100)) # Create window
s = snake(RED, (10, 10))
snack = cube(randomSnack(rows, s), color=GREEN)
agent = DQNAgent()
speed = 0 # 0 --> Fast speed ; 10 --> Normal speed
pygame.init() # Init pygame
clock = pygame.time.Clock()
while s.numOfGame < 300: # Only play 300 games
    clock.tick(speed) # Delay for speed
    agent.epsilon = 80 - s.numOfGame # Epsilon starts high and decreases as more games are played
    state_old = agent.get_state(snack, s) # Get the state BEFORE the move is made
    if random.randint(0, 200) < agent.epsilon:
        # Explore
        finale_move = random.randint(0, 3) # Random move (0: Left, 1: Up, 2: Right, 3: Down)
        # print('Explore')
    else:
        # Exploitation
        prediction = agent.model.predict(state_old.reshape((1, 11))) # Get action for given state from NN
        finale_move = np.argmax(prediction[0]) # Predicted move
        # print('Predicted move:', finale_move)
        # print('Exploit')
    s.move(finale_move) # Execute the chosen move
    state_new = agent.get_state(snack, s) # Get the state AFTER the move is made
    appleWasEaten = False # Bool for reward
    if s.body[0].pos == snack.pos: # If snake eats an apple
        s.addCube()
        appleWasEaten = True # If an apple was eaten set to true
        snack = cube(randomSnack(rows, s), color=GREEN)
    '''for x in range(len(s.body)):
        if s.body[x].pos in list(map(lambda z: z.pos, s.body[x + 1:])): # If snake bites its own tail
            s.die()
            break'''
    reward = agent.set_reward(s.isDead, appleWasEaten) # Set reward for the new state following the action
    # print(reward)
    agent.train_short_memory(state_old, finale_move, reward, state_new, s.isDead) # Train short memory with new action and state
    agent.remember(state_old, finale_move, reward, state_new, s.isDead) # Save the new data in long term memory
    if s.isDead: # If the die() function was called, isDead is True and then the game is reset
        agent.replay_new(agent.memory) # Fit the neural network on the long term memory (experience replay)
        s.reset((10, 10))
        initialize_game(agent, appleWasEaten)
    highscore = getHighscore(highscore) # Set highscore
    redrawWindow(win, highscore)
agent.model.save_weights('weights.hdf5')
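For reference, this is how I understand the exploration schedule above to work out numerically (just illustrative arithmetic, not code from the repo):

def explore_prob(n):
    # Probability that random.randint(0, 200) < epsilon, with epsilon = 80 - n in game n
    epsilon = 80 - n
    return max(0, min(epsilon, 201)) / 201.0  # randint(0, 200) has 201 possible values

print(explore_prob(0))   # ~0.398 random moves in the first game
print(explore_prob(40))  # ~0.199 after 40 games
print(explore_prob(80))  # 0.0 -> pure exploitation from game 80 onwards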
Here is the whole DQN class:
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
import random
import numpy as np
import pandas as pd
from operator import add
class DQNAgent(object):
    def __init__(self):
        self.reward = 0
        self.gamma = 0.9
        self.dataframe = pd.DataFrame()
        self.short_memory = np.array([])
        self.agent_target = 1
        self.agent_predict = 0
        self.learning_rate = 0.0005
        self.model = self.network()
        # self.model = self.network("weights.hdf5")
        self.epsilon = 0
        self.actual = []
        self.memory = []

    def get_state(self, snack, s): # Get state for the NN (11 booleans)
        new_state = [
            (s.body[0].dirnx == -1 and s.body[0].pos[0] < 1) or (s.body[0].dirnx == 1 and s.body[0].pos[0] > 18)
            or (s.body[0].dirny == 1 and s.body[0].pos[1] > 18) or (s.body[0].dirny == -1 and s.body[0].pos[1] < 1), # Danger Straight
            (s.body[0].dirnx == -1 and s.body[0].pos[1] < 1) or (s.body[0].dirnx == 1 and s.body[0].pos[1] > 18)
            or (s.body[0].dirny == 1 and s.body[0].pos[0] < 1) or (s.body[0].dirny == -1 and s.body[0].pos[0] > 18), # Danger Right
            (s.body[0].dirnx == -1 and s.body[0].pos[1] > 18) or (s.body[0].dirnx == 1 and s.body[0].pos[1] < 1)
            or (s.body[0].dirny == 1 and s.body[0].pos[0] > 18) or (s.body[0].dirny == -1 and s.body[0].pos[0] < 1), # Danger Left
            s.body[0].dirnx == -1, # Move Left
            s.body[0].dirnx == 1, # Move Right
            s.body[0].dirny == -1, # Move Up
            s.body[0].dirny == 1, # Move Down
            s.body[0].pos[0] > snack.pos[0], # Snack Left
            s.body[0].pos[0] < snack.pos[0], # Snack Right
            s.body[0].pos[1] > snack.pos[1], # Snack Up
            s.body[0].pos[1] < snack.pos[1] # Snack Down
        ]
        for i in range(len(new_state)): # Convert the booleans to 0/1
            if new_state[i]:
                new_state[i] = 1
            else:
                new_state[i] = 0
        # print(new_state)
        return np.asarray(new_state)

    def set_reward(self, snakeIsDead, appleWasEaten): # Sets the reward for the current state
        self.reward = 0
        if snakeIsDead:
            self.reward = -10
        if appleWasEaten:
            self.reward = 10
        return self.reward

    def network(self, weights=None): # Build the NN: 11 inputs -> 3 hidden layers of 120 -> 4 outputs
        model = Sequential()
        model.add(Dense(output_dim=120, activation='relu', input_dim=11))
        model.add(Dropout(0.15))
        model.add(Dense(output_dim=120, activation='relu'))
        model.add(Dropout(0.15))
        model.add(Dense(output_dim=120, activation='relu'))
        model.add(Dropout(0.15))
        model.add(Dense(output_dim=4, activation='softmax'))
        opt = Adam(self.learning_rate)
        model.compile(loss='mse', optimizer=opt)
        if weights:
            model.load_weights(weights)
        return model

    def remember(self, state, action, reward, next_state, done): # Store a transition in long term memory
        self.memory.append((state, action, reward, next_state, done))

    def replay_new(self, memory): # Experience replay: train on up to 1000 stored transitions
        if len(memory) > 1000:
            minibatch = random.sample(memory, 1000)
        else:
            minibatch = memory
        for state, action, reward, next_state, done in minibatch:
            target = reward
            if not done:
                target = reward + self.gamma * np.amax(self.model.predict(np.array([next_state]))[0])
            target_f = self.model.predict(np.array([state]))
            target_f[0][np.argmax(action)] = target
            self.model.fit(np.array([state]), target_f, epochs=1, verbose=0)

    def train_short_memory(self, state, action, reward, next_state, done): # Train on the single latest transition
        target = reward
        if not done:
            target = reward + self.gamma * np.amax(self.model.predict(next_state.reshape((1, 11)))[0])
        target_f = self.model.predict(state.reshape((1, 11)))
        target_f[0][np.argmax(action)] = target
        self.model.fit(state.reshape((1, 11)), target_f, epochs=1, verbose=0)
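To make sure I understand the update in replay_new and train_short_memory: with gamma = 0.9, if the snake just ate an apple (reward = 10) and the network's highest Q-value for the next state is, say, 2.0, then the target for the chosen action becomes 10 + 0.9 * 2.0 = 11.8, and only that single entry of target_f is overwritten before fitting. As far as I can tell, that is meant to match the standard Q-learning target, target = reward + gamma * max_a' Q(next_state, a').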