Hi Stack Overflow. I recently tried to build a DQN bot that learns to play tic-tac-toe. I've eliminated a lot of things I thought were bugs, but my DQN is still a mess and basically learns to kill the game for itself right at the start. I decided to allow it to make moves that are illegal in tic-tac-toe, but with a huge negative reward for doing so. Despite that penalty, it seems to just want to die immediately.
I've done my best to comment the code, but some of the DQN code was taken from a tutorial and I don't fully understand how it works; I've commented it as best I can. I know this is a long shot, and I've already tried getting help from other places. If you really want to help a brother out, you can reach me at Bucket of Crabs#1000 so I can walk you through the code and hopefully get your help diagnosing it. I know that's a huge commitment to ask of a random stranger, so thank you for any effort you put into helping me; I realize it's a big time investment to try to figure out where exactly I went wrong, and I really appreciate you even reading this far. If you have any resources that could help me figure out on my own why it's going wrong, I'd appreciate that too. There's more information in this Reddit post: https://old.reddit.com/r/learnmachinelearning/comments/e9t4i7/help_me_fix_my_tictactoe_dqn/
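To make the reward scheme concrete (these are the values main() below uses): a valid move that doesn't end the game gets +2, a win or a tie is supposed to get +10, a loss -10, and an illegal move ends the episode with a -1000 reward. With gamma = 0.95, the per-move targets that replay() builds should therefore look roughly like this:

q_future = 0.0   # stand-in for the max Q-value the target network predicts for the next state
# illegal move: the episode is marked done, so the target is just the raw penalty
target_illegal = -1000
# valid move that doesn't end the game: small reward plus the discounted bootstrap value
target_valid = 2 + 0.95 * q_future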
The full code is below:
import tensorflow as tf
import random
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from collections import deque
import numpy as np
import warnings
import os
class DQN:
    def __init__(self):
        self.memory = deque(maxlen=2000)
        # I don't really know what these parameters mean if I'm being honest, they were given in the tutorial
        self.gamma = 0.95
        self.epsilon = 1
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.2
        tf.keras.backend.clear_session()
        self.model = self.create_model()
        self.model.summary()
        # "hack" implemented by DeepMind to improve convergence
        self.target_model = self.create_model()

    def create_model(self):
        # My model
        model = Sequential()
        model.add(Dense(9, input_dim=1,
                        activation="relu"))
        # 1 hidden layer chosen because I read most of the time you don't need more lmao, I chose 9 perceptrons because I read that a good number to pick
        # is the mean of the input and output layers
        # model.add(Dense(9, activation="relu"))
        model.add(Dense(9, activation="linear"))
        model.compile(loss="mean_squared_error",
                      optimizer=Adam(lr=self.learning_rate))
        return model
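
    # Not called anywhere -- just a sketch of the shape I was aiming for, i.e. the flattened
    # 3x3 board (9 values) as input and one Q-value per square as output. The method name is
    # only for illustration; create_model() above is what actually runs.
    def create_model_flat(self):
        model = Sequential()
        model.add(Dense(9, input_dim=9, activation="relu"))  # 9 board cells in
        model.add(Dense(9, activation="linear"))              # 9 Q-values out, one per square
        model.compile(loss="mean_squared_error",
                      optimizer=Adam(lr=self.learning_rate))
        return model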
    def remember(self, state, action, reward, new_state, done):
        self.memory.append([state, action, reward, new_state, done])

    # this code is basically directly taken from the tutorial's code
    def replay(self):
        # batch size was lowered because training was taking a million years
        batch_size = 9
        if len(self.memory) < 9:
            return
        samples = random.sample(self.memory, batch_size)
        for sample in samples:
            # We have a sample
            state, action, reward, new_state, done = sample
            # We make a prediction based on the sample
            target = self.target_model.predict(state, batch_size=10)
            # If we lost due to that action we set the reward of that action equal to the reward
            if done:
                target[0][action] = reward
            else:  # else we set the reward of the action based on the max predicted reward of the following step
                Q_future = max(
                    self.target_model.predict(new_state, batch_size=1)[0])
                target[0][action] = reward + Q_future * self.gamma
            self.model.fit(state, target, epochs=1, verbose=0)

    def target_train(self):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            target_weights[i] = weights[i]
        self.target_model.set_weights(target_weights)
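
    # Not used above -- a sketch of the "soft" target update some DQN write-ups do instead of
    # a full copy every step (tau is a hypothetical blending factor, not something my code has):
    def soft_target_train(self, tau=0.125):
        weights = self.model.get_weights()
        target_weights = self.target_model.get_weights()
        for i in range(len(target_weights)):
            # blend a small fraction of the online weights into the target network
            target_weights[i] = weights[i] * tau + target_weights[i] * (1 - tau)
        self.target_model.set_weights(target_weights)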
    def act(self, state):
        self.epsilon *= self.epsilon_decay
        self.epsilon = max(self.epsilon_min, self.epsilon)
        if np.random.random() < self.epsilon:
            # Originally I had it just do any random number, but I figured it might be good to have the AI have guaranteed okay-ish data to remember
            # It used to just be random
            rntry = random.sample([0,1,2,3,4,5,6,7,8], 1)[0]
            while state[rntry] != 0:
                rntry = random.sample([0,1,2,3,4,5,6,7,8], 1)[0]
            return rntry
        return np.argmax(self.model.predict(state, batch_size=10)[0])
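In case it helps with the diagnosis, this is the kind of masking I could try instead of leaning on the -1000 penalty: push the Q-values of occupied squares down to -inf before the argmax, so the greedy pick can only ever be a legal move. A rough sketch, not part of the code above; it assumes state is the flattened 9-element board and that the model outputs one Q-value per square:

def act_masked(agent, state):
    q_values = agent.model.predict(state, batch_size=1)[0]
    # occupied squares (non-zero cells) get -inf so argmax can never pick them
    masked = np.where(np.asarray(state) == 0, q_values, -np.inf)
    return int(np.argmax(masked))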
Runner / TicTacToe:
import ctypes
hllDll = ctypes.WinDLL("C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\bin\\cudart64_101.dll")
hlldll2 = ctypes.WinDLL("C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\bin\\cublasLt64_10.dll")
hlldll2 = ctypes.WinDLL("C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\bin\\cublas64_10.dll")
hlldll3 = ctypes.WinDLL("C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\bin\\cufft64_10.dll")
hlldll4 = ctypes.WinDLL("C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\bin\\curand64_10.dll")
hlldll5 = ctypes.WinDLL("C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\bin\\cusolver64_10.dll")
hlldll6 = ctypes.WinDLL("C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\bin\\cusparse64_10.dll")
from collections import deque
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
import random
from tensorflow.keras.layers import Flatten
from NeuralNet import *
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# Flattens the 3x3 board into a single list of 9 values, row by row
def flatten(state):
    ret = []
    for col in state:
        for ele in col:
            ret.append(ele)
    return ret
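# quick sanity check of the ordering flatten() produces: index i is row i // 3, column i % 3
assert flatten([[0, 0, 0],
                [1, 2, 0],
                [0, 0, 1]]) == [0, 0, 0, 1, 2, 0, 0, 0, 1]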
# x = 1, o = 2
# Tests the game-over states for the tic-tac-toe game
def gameOver(game):
    for n in game[0]: # straight down
        if(game[0][n] == game[1][n] and game[0][n] == game[2][n] and game[0][n] != 0):
            return game[0][n]
    if(game[0][0] == game[1][1] and game[0][0] == game[2][2] and game[0][0] != 0): # diagonal top left
        return game[0][0]
    if(game[2][0] == game[1][1] and game[2][0] == game[0][2] and game[2][0] != 0): # diagonal top right
        return game[2][0]
    for n in game: # straight across
        if(n[0] == n[1] and n[0] == n[2] and n[0] != 0):
            return n[0]
    tie = 0
    for n in game:
        for m in n:
            if(n[m] != 0):
                tie += 1
    if(tie == 9):
        return 3
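# For reference, the convention gameOver() returns: 1 = X won, 2 = O won, 3 = tie,
# None = game still going. Two quick checks of that:
assert gameOver([[1, 1, 1],
                 [2, 0, 2],
                 [0, 0, 0]]) == 1       # X holds the top row
assert gameOver([[0, 0, 0],
                 [0, 0, 0],
                 [0, 0, 0]]) is None    # empty board, nothing decided yet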
# our model of tic-tac-toe; it takes the state, a move, and a couple of parameters so we know who is playing
def play(curState, move, turn, ran):
    if not turn:
        # This is what the algorithm competes against, an algorithm that just picks random valid moves
        move = random.sample([0,1,2,3,4,5,6,7,8], 1)[0]
        while curState[move%3][move//3] != 0:
            move = random.sample([0,1,2,3,4,5,6,7,8], 1)[0]
        if not ran:
            # used to play against the bot during/at the end of training
            print(str(curState[0]) + "\n" + str(curState[1]) + "\n" + str(curState[2]))
            move = int(input("1-9: ")) - 1
    value = [(0,0),(0,1),(0,2),(1,0),(1,1),(1,2),(2,0),(2,1),(2,2)]
    coord = value[move]
    x = coord[0]  # row
    y = coord[1]  # column
    if curState[x][y] != 0:  # checks if the chosen coordinate has already been taken
        if not ran:
            return (curState, True, -1)
        else:
            return (curState, True, 1)
    if turn == True:  # alternates the turn and changes the index
        curState[x][y] = 1
    else:
        curState[x][y] = 2
    # prints the game board
    if gameOver(curState) != None:
        return (curState, True, gameOver(curState))
    else:
        return (curState, False, 0)
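# play() returns (new_state, done, outcome); outcome is what main() turns into a reward.
# Two quick checks of what I expect back (note the board list is modified in place):
example_board = [[0, 0, 0], [0, 0, 0], [0, 0, 0]]
# a valid opening move for the DQN (turn=True): square 0 gets a 1 and the game keeps going
assert play(example_board, 0, True, True) == (example_board, False, 0)
# playing the same square again is illegal; with ran=False that ends the game with outcome -1
assert play(example_board, 0, True, False) == (example_board, True, -1)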
def main():
    # Trials
    trials = 100000
    # this was in the original
    trial_len = 12
    reward = 0
    updateTargetNetwork = 1000
    dqn_agent = DQN()
    steps = []
    # Iterate over the number of trials
    for trial in range(trials):
        # Allows me to take control of the AI competing against the DQN toward the end of the trials
        if trial < trials*.9:
            random = True
        else:
            random = False
        cur_state = [[0,0,0],[0,0,0],[0,0,0]]
        # Basically a while-true loop, since trial_len > the total number of turns in a tic-tac-toe game
        for step in range(trial_len):
            # Default reward, i.e. it made a valid move that didn't end the game
            reward = 2
            # We get our action from the DQN
            action = dqn_agent.act(np.asarray(flatten(cur_state)))
            if not random:
                print("Move:" + str(action+1))
            # We feed the cur state and the action into the tic-tac-toe game
            new_state, done, outcome = play(cur_state, action, True, False)
            # I save the current state because I need to feed it into the DQN's remember function later
            # Actually not using it right now; it occurred to me that when replay was making predictions based on the model,
            # the model wouldn't see the response from the opponent, so it would probably think some moves are valid that aren't,
            # i.e. it would think that playing on top of the enemy in some cases would be good because it hadn't even seen that the opponent had already played there
            bot_state = cur_state
            # Not sure if this makes sense, but before I update the DQN I play the opponent's move, that way if the bot did something that
            # led directly to it losing we can remember that.
            if not done:
                cur_state, done, _ = play(cur_state, -1, False, random)
            # Decides the reward based on the outcome of the tic-tac-toe turn
            # Only does this if the game is over, because we are assigning some reward just for valid moves, so in those cases we leave it at the default
            if done:
                if outcome == 1:
                    reward = 10
                elif outcome == 3:
                    reward = 10
                elif outcome == -1:
                    reward = -1000
                else:
                    reward = -10
            # We remember the DQN's action and states
            dqn_agent.remember(np.asarray(flatten(bot_state)), action,
                               reward, np.asarray(flatten(cur_state)), done)
            # We learn
            dqn_agent.replay()
            # We copy the current model's weights over to the target
            dqn_agent.target_train()
            print(reward)
            if done:
                break

main()
Thanks again for any time you spend on this, I really appreciate it.