Can someone help me figure out why my DQN is learning the wrong choices?

Time: 2019-12-14 13:35:23

Tags: python tensorflow machine-learning keras reinforcement-learning

Hello Stack Overflow, I've recently been trying to make a DQN bot that learns to play tic-tac-toe. I've eliminated a lot of things I thought were bugs, but my DQN is still very confused and has basically learned to kill the game right at the start, on its own. I decided to allow it to make moves that are illegal in tic-tac-toe, but doing so carries a huge negative reward. Despite that penalty, it seems to just want to die immediately.
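
To make the reward setup concrete, this is roughly the scheme the runner at the bottom ends up using (just a condensed sketch of the logic that lives in main() below, with a made-up helper name, not a separate piece of my code):

def reward_for(outcome, done):
    # condensed sketch of the reward logic in main() below
    if not done:
        return 2      # any valid move that doesn't end the game
    if outcome == 1 or outcome == 3:
        return 10     # the DQN won, or the game ended in a tie
    if outcome == -1:
        return -1000  # illegal move (played on an occupied square)
    return -10        # anything else, i.e. the game ended against the DQN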

I've done my best to comment the code, but some of my DQN code was taken from a tutorial and I don't fully understand how it works; I've commented it with how I think it works. I know this is a bit of a long shot, and I've already tried to get help from other sources. If you really want to help a brother out, you can reach me at http:// bucket of Crabs#1000 so I can walk you through the code, catch you up, and hopefully help you diagnose it. I know that's a huge commitment to ask of a random stranger, but I appreciate any effort to help me, and I recognize that trying to fix wherever I went wrong is a big time investment, so thank you very much for even reading this far. If you have any resources that could help me understand why it's going wrong on my own, I'd appreciate that too. There's more information in this Reddit post: https://old.reddit.com/r/learnmachinelearning/comments/e9t4i7/help_me_fix_my_tictactoe_dqn/

The full code is below:

import tensorflow as tf
import random
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation, Flatten
from collections import deque
import numpy as np
import warnings
import os




class DQN:
    def __init__(self):
        self.memory  = deque(maxlen=2000)
        # I don't really know what these parameters mean, if I'm being honest; they were given in the tutorial
        self.gamma = 0.95
        self.epsilon = 1
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.2
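        # (for reference, from what I can gather: gamma is the discount factor on future
        # rewards, epsilon is the exploration rate, which decays by epsilon_decay after
        # every action down to a floor of epsilon_min, and learning_rate is handed to the
        # Adam optimizer below)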
        tf.keras.backend.clear_session()
        self.model = self.create_model()

        self.model.summary()
        # "hack" implemented by DeepMind to improve convergence
        self.target_model = self.create_model()
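        # (my understanding of the trick: the target network is a copy of the main network
        # whose weights are only updated in target_train(), so the Q-value targets computed
        # in replay() come from a more stable copy than the network being fit)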
    def create_model(self):
        #My model
        model   = Sequential()
        model.add(Dense(9, input_dim=1, 
            activation="relu"))
        #1 hidden layer chosen because I read that most of the time you don't need more, lmao. I chose 9 units because I read that a good number to pick
        #is the mean of the sizes of the input and output layers
        #model.add(Dense(9, activation="relu"))
        model.add(Dense(9,activation="linear"))
        model.compile(loss="mean_squared_error",
            optimizer=Adam(lr=self.learning_rate))
        return model
    def remember(self, state, action, reward, new_state, done):
            self.memory.append([state, action, reward, new_state, done])
    #this code is basically directly taken from the tutorial's code
    def replay(self):
            #batch number was lowered because training was taking a million years
            batch_size = 9
            if len(self.memory) < 9: 
                return
            samples = random.sample(self.memory, batch_size)
            for sample in samples:
                #We have a sample
                state, action, reward, new_state, done = sample
                #We make a prediction based on the sample
                target = self.target_model.predict(state,batch_size=10)

                #If the game ended due to that action, the target Q-value for that action is just the reward
                if done:

                    target[0][action] = reward
                else: # else we set the reward of the action based on the max predicted reward of the following step
                    Q_future = max(
                        self.target_model.predict(new_state,batch_size=1)[0])

                    target[0][action] = reward + Q_future * self.gamma
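                    # i.e. the standard Q-learning target:
                    # Q(s, a) <- reward + gamma * max_a' Q_target(s', a')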

                # 

                self.model.fit(state, target, epochs=1, verbose=0)
    def target_train(self):
            weights = self.model.get_weights()
            target_weights = self.target_model.get_weights()
            for i in range(len(target_weights)):
                target_weights[i] = weights[i]
            self.target_model.set_weights(target_weights)
    def act(self, state):
            self.epsilon *= self.epsilon_decay
            self.epsilon = max(self.epsilon_min, self.epsilon)
            if np.random.random() < self.epsilon:
                #Originally I had it just do any random number, but I figured it might be good to have the AI have guaranteed okay-ish data to remember
                #It used to just be random
                rntry = random.sample([0,1,2,3,4,5,6,7,8],1)[0]
                while(state[rntry]!=0):
                    rntry = random.sample([0,1,2,3,4,5,6,7,8],1)[0]

                return rntry


            return np.argmax(self.model.predict(state,batch_size=10)[0])

Runner / tic-tac-toe

import ctypes

# work-around to make sure TensorFlow can find the CUDA DLLs on Windows
hllDll = ctypes.WinDLL("C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\bin\\cudart64_101.dll")
hlldll2 = ctypes.WinDLL("C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\bin\\cublasLt64_10.dll")
hlldll2 = ctypes.WinDLL("C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\bin\\cublas64_10.dll")

hlldll3 = ctypes.WinDLL("C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\bin\\cufft64_10.dll")
hlldll4 = ctypes.WinDLL("C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\bin\\curand64_10.dll")
hlldll5 = ctypes.WinDLL("C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\bin\\cusolver64_10.dll")
hlldll6 = ctypes.WinDLL("C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.2\\bin\\cusparse64_10.dll")


from collections import deque
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
import random
from tensorflow.keras.layers import Flatten
from NeuralNet import *
import os




os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
def flatten(state):
    ret = []
    for col in state:
        for ele in col:
            ret.append(ele)
    return ret
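# e.g. flatten([[0,1,2],[0,0,0],[0,0,0]]) -> [0, 1, 2, 0, 0, 0, 0, 0, 0]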


# x = 1, o = 2
#Tests the game-over states for the tic-tac-toe game
def gameOver(game):
    for n in range(3): #straight down (columns)
        if(game[0][n] == game[1][n] and game[0][n] == game[2][n] and game[0][n]!=0):
            return game[0][n]
    if(game[0][0] == game[1][1] and game[0][0] == game[2][2] and game[0][0] != 0): #diagonal top left
        return game[0][0]
    if(game[2][0] == game[1][1] and game[2][0] == game[0][2] and game[2][0] != 0): #diagonal top right
        return game[2][0]
    for n in game: # straight across
        if(n[0] == n[1] and n[0] == n[2] and n[0] != 0):
            return n[0]
    tie = 0
    for n in game:
        for m in n:
            if(m != 0):
                tie += 1
    if(tie == 9):
        return 3
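# (gameOver returns 1 or 2 for the winning player, 3 for a tie, and None if the game isn't over yet)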


# our model of tic-tac-toe: takes the state, a move, and a couple of parameters so we know who is playing
def play(curState,move, turn,ran):
    if not turn:
        #This is what the algorithm competes against, an algorithm that just picks random valid moves
        move = random.sample([0,1,2,3,4,5,6,7,8],1)[0]
        while curState[move//3][move%3]!=0 :
            move = random.sample([0,1,2,3,4,5,6,7,8],1)[0]
        if not ran:      
            #used to play against the bot during/at the end of training
            print(str(curState[0]) + "\n" + str(curState[1]) + "\n" + str(curState[2]))
            move = int(input("1-9: "))-1 
    value = [(0,0),(0,1),(0,2),(1,0),(1,1),(1,2),(2,0),(2,1),(2,2)]
    coord = value[move]
    x = coord[0] # row
    y = coord[1] # column


    if curState[x][y] != 0: # checks if the coordinate input has already been replaced
        if not ran:
            return (curState, True, -1)
        else:
            return (curState,True,1)
    if turn == True: # alternates the turn and changes the index
        curState[x][y] = 1
    else:
        curState[x][y] = 2
    # checks whether that move ended the game
    if gameOver(curState) != None:
        return (curState,True,gameOver(curState))
    else:
        return (curState,False,0)
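# (play returns (state, done, outcome): outcome is -1 for an invalid move when ran is False,
#  1 for an invalid move when ran is True, 1 or 2 for the winning player, 3 for a tie,
#  and 0 while the game continues)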


def main():

    #Trials
    trials  = 100000
    #this was in the original tutorial
    trial_len = 12
    reward = 0
    updateTargetNetwork = 1000
    dqn_agent = DQN()

    steps = []
    #Iterate over the number of trials
    for trial in range(trials):
        #Allows me to take control of the AI competing against the DQN toward the end of the trials
        if trial < trials*.9:
            random_opponent = True
        else:
            random_opponent = False
        cur_state = [[0,0,0],[0,0,0],[0,0,0]]

        #Basically a while-true loop, since trial_len > the total number of turns in a tic-tac-toe game
        for step in range(trial_len):
            #Default reward, i.e. it made a valid move that didn't end the game
            reward = 2
            #We get our action from the dqn
            action = dqn_agent.act(np.asarray(flatten(cur_state)))
            if not random_opponent:
                print("Move:" + str(action+1))
            #We feed the cur state and the action into the tic-tac-toe game
            new_state,done,outcome = play(cur_state,action,True,False)
            #I save the current state because I need to feed it into the DQN's remember function later
            #Actually not using it right now; it occurred to me that when replay was making predictions based on the model,
            #it would be the model that wouldn't see the response from the opponent. As a result it would probably think some moves are valid that aren't,
            #i.e. it would think that playing on top of the enemy is good in some cases because it hadn't even seen that the opponent had already played there
            bot_state = cur_state
            #Not sure if this makes sense, but before I update the DQN I play the opponent's move, that way if the bot did something that
            #led directly to it losing we can remember that.
            if not done:
                cur_state,done,_ = play(cur_state,-1,False,random_opponent)
            #Decides the reward based on the outcome of the tic-tac-toe turn
            #Only does this if the game is over because we are assigning some reward just for valid moves so in those cases we leave it to the default
            if done:
                if outcome == 1:
                    reward=10
                elif outcome==3:
                    reward=10
                elif outcome == -1:
                    reward = -1000
                else:
                    reward = -10
            #We remember the DQN's action and states
            dqn_agent.remember(np.asarray(flatten(bot_state)), action, 
                reward, np.asarray(flatten(cur_state)), done)
            #We learn
            dqn_agent.replay()
            #We copy the current models weights over to the target
            dqn_agent.target_train()




            print(reward)

            if done:
                break


main()

Thanks again for any time you spend on this; I really appreciate it.
