I want to implement the policy gradient algorithm from RL for a tic-tac-toe game. I have looked at a few examples, e.g. https://www.youtube.com/watch?v=UT9pQjVhcaU&t=724s
But when I let my policy agent play against a random player, it loses far too many games. I think I am misunderstanding the discounted sum of rewards, so there is probably a mistake in how I compute the rewards and G. The difficulty with tic-tac-toe is that there is really only a single reward at the very end of the game (win, lose, or draw), so the individual states carry no reward of their own. Here is my code; it would be great if someone could spot the error.
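For reference, my understanding of how a single terminal reward is usually turned into per-step returns in a Monte-Carlo policy gradient: the return of the move at time t is just the final reward discounted once by its distance from the end of the episode, G_t = gamma**(T-1-t) * R. A minimal sketch (gamma, R and the episode length T are only placeholder values here):
# Sketch only: spreading one terminal reward R over an episode of T moves,
# G_t = gamma**(T-1-t) * R. gamma, R and T are placeholder values.
def returns_from_terminal_reward(R, T, gamma=0.9):
    return [R * gamma**(T - 1 - t) for t in range(T)]

print(returns_from_terminal_reward(1.0, 4))  # ≈ [0.729, 0.81, 0.9, 1.0]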
Packages:
import random, math
import numpy as np
import tensorflow as tf
print(tf.__version__)
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
Basic implementation of tic-tac-toe:
class TicTacTo:
    def __init__(self):
        self.printing = False
    def setBoard(self):
        self.board = [0] * 9
    def getPlayerName(self, val):
        for player_name, player_value in PLAYER.items():
            if player_value == val:
                return player_name
    def printBoard(self):
        if (self.printing):
            boardDisplay = ['_'] * 9
            for i, val in enumerate(self.board):
                if val != 0:
                    boardDisplay[i] = self.getPlayerName(val)
            print(boardDisplay[0] + '|' + boardDisplay[1] + '|' + boardDisplay[2])
            print(boardDisplay[3] + '|' + boardDisplay[4] + '|' + boardDisplay[5])
            print(boardDisplay[6] + '|' + boardDisplay[7] + '|' + boardDisplay[8])
            print("\n")
    def printResult(self, result):
        if (self.printing):
            if result == 0:
                print("DRAW!")
            else:
                print("{} won the game!".format(self.getPlayerName(result)))
    @staticmethod
    def check(board):
        # check for diagonals
        if board[0] != 0 and board[0] == board[4] == board[8]: # check first diagonal
            return board[0]
        if board[2] != 0 and board[2] == board[4] == board[6]: # check second diagonal
            return board[2]
        # check horizontal
        for n in range(3):
            if (board[3*n+0] != 0) and (board[3*n+0] == board[3*n+1] == board[3*n+2]):
                return board[3*n+0]
        # check vertical
        for i in range(3):
            if (board[i] != 0) and (board[i] == board[i+3] == board[i+6]):
                return board[i]
        # check for a draw
        if all(i != 0 for i in board):
            return 0
        return 2
    def evaluate(self):
        result = TicTacTo.check(self.board)
        if result != 2: # check if game is finished
            self.printResult(result)
            return True
        return False
    def move(self, player, position):
        """
        Player can take a move
        :param player: Object of the player
        :position: The position in the board where the move is set
        :return: Result of the game
            [1]  => Player X won
            [-1] => Player O won
            [0]  => Draw
            [2]  => Game is not finished
        """
        self.board[position] = player.value
        self.printBoard()
        return self.evaluate()
    def availableMoves(self):
        empty = []
        for i, val in enumerate(self.board):
            if val == 0:
                empty.append(i)
        return empty
    def simulate(self, playerA, playerB):
        self.setBoard()
        self.printBoard()
        playerA.start()
        playerB.start()
        while True:
            moveA = playerA.turn(self.board, self.availableMoves())
            stop = self.move(playerA, moveA)
            if(stop): break
            moveB = playerB.turn(self.board, self.availableMoves())
            stop = self.move(playerB, moveB)
            if(stop): break
        result = TicTacTo.check(self.board)
        playerA.learn(result)
        playerB.learn(result)
    def simulations(self, playerA, playerB, games, printing):
        self.printing = printing
        x_win = 0
        o_win = 0
        draw = 0
        for n in range(games):
            self.simulate(playerA, playerB)
            result = TicTacTo.check(self.board)
            if (result == 0): draw += 1
            elif (result == 1): x_win += 1
            elif (result == -1): o_win += 1
        total = x_win + o_win + draw
        #print("Win X: {}%, Win O: {}%, Draw: {}%".format(100*(x_win/total), 100*(o_win/total), 100*(draw/total)))
        return x_win, o_win, draw
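For reference, the result codes described in the docstring of move() can be reproduced directly with the static check() method. The boards below are made up only to illustrate the codes (using the 1 / -1 / 0 encoding from above):
# Hand-written example boards, only to illustrate the result codes of check():
print(TicTacTo.check([1, 1, 1, -1, -1, 0, 0, 0, 0]))    # 1  -> X won (top row)
print(TicTacTo.check([-1, 1, 1, 1, -1, 0, 0, 0, -1]))   # -1 -> O won (main diagonal)
print(TicTacTo.check([1, -1, 1, 1, -1, -1, -1, 1, 1]))  # 0  -> draw, board full
print(TicTacTo.check([1, 0, 0, 0, -1, 0, 0, 0, 0]))     # 2  -> game not finished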
Evaluating games between two players:
def evaluu(game, playerA, playerB, num_battles, games_per_battle = 100):
    x_wins = []
    o_wins = []
    draws = []
    game_number = []
    game_counter = 0
    for i in range(num_battles):
        xwin, owin, draw = game.simulations(playerA, playerB, games_per_battle, False)
        total = xwin + owin + draw
        print("End Win X: {}%, Win O: {}%, Draw: {}%".format(100*(xwin/total), 100*(owin/total), 100*(draw/total)))
        print("Round: ", game_counter)
        x_wins.append(xwin*100.0/games_per_battle)
        o_wins.append(owin*100.0/games_per_battle)
        draws.append(draw*100.0/games_per_battle)
        game_counter = game_counter + 1
        game_number.append(game_counter)
    plt.ylabel('Game outcomes in %')
    plt.xlabel('Game number')
    plt.plot(game_number, draws, 'r-', label='Draw')
    plt.plot(game_number, x_wins, 'g-', label='Player X wins')
    plt.plot(game_number, o_wins, 'b-', label='Player O wins')
    plt.legend(loc='best', shadow=True, fancybox=True, framealpha=0.7)
The random player class:
PLAYER = {"X": 1, "O": -1}
class RandomPlayer:
    def __init__(self, player_name):
        self.name = player_name
        self.value = PLAYER[self.name]
    def start(self):
        pass
    def turn(self, board, availableMoves):
        return availableMoves[random.randrange(0, len(availableMoves))]
    def learn(self, result):
        pass
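For reference, a random-vs-random baseline can be run with just the two classes above; this is only a sanity check of the environment, not part of the training:
# Sanity check: two random players against each other, using only the classes above
baseline_game = TicTacTo()
x_win, o_win, draw = baseline_game.simulations(RandomPlayer("X"), RandomPlayer("O"), 1000, False)
print("Random vs. random -> X: {}, O: {}, Draw: {}".format(x_win, o_win, draw))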
The neural network for the policy agent:
class NeuralNetwork():
    def __init__(self, learning_rate = 0.0001):
        self.model = tf.keras.Sequential()
        self.learning_rate = learning_rate
        self.hidden_layer = tf.keras.layers.Dense(243, activation=tf.nn.relu, input_dim=27)
        self.output_layer = tf.keras.layers.Dense(9)
        self.model.add(self.hidden_layer)
        self.model.add(self.output_layer)
        #self.model.build()
    def training(self, board_state_memory, action_state_memory, G):
        output = []
        for board_state in board_state_memory:
            output.append(self.model.predict(x=board_state)[0])
        #print(np.array(loit).shape)
        #print(np.array(action_state_memory).shape)
        def loss():
            with tf.compat.v1.Session() as sess:
                neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=output, labels=action_state_memory, name=None)
                sess.run(neg_log_prob)
                #print(neg_log_prob * G)
                return neg_log_prob * G
        self.optimizer = tf.keras.optimizers.Adam(learning_rate=self.learning_rate)
        self.optimizer.minimize(loss, var_list=self.model.weights)
    def predict(self, board_state):
        output = self.model.predict(x=board_state)
        actions = tf.nn.softmax(output)
        return actions[0]
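For comparison only: a common way to write this kind of policy-gradient update in TF2 is with tf.GradientTape instead of a tf.compat.v1.Session. The sketch below uses the same layer sizes as the model above (27 inputs, 9 logits), but it is not a drop-in replacement for my training() method; states, actions and returns are assumed to be episode-length arrays:
# Sketch only: a REINFORCE-style update with tf.GradientTape (TF2 pattern),
# using the same layer sizes as the model above. Not a drop-in replacement.
import tensorflow as tf

pg_model = tf.keras.Sequential([
    tf.keras.layers.Dense(243, activation="relu", input_dim=27),
    tf.keras.layers.Dense(9),
])
pg_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

def reinforce_update(states, actions, returns):
    # states: (T, 27) float32, actions: (T,) int32, returns: (T,) float32
    with tf.GradientTape() as tape:
        logits = pg_model(states, training=True)                    # (T, 9)
        neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=actions, logits=logits)                          # (T,)
        loss = tf.reduce_mean(neg_log_prob * returns)
    grads = tape.gradient(loss, pg_model.trainable_variables)
    pg_optimizer.apply_gradients(zip(grads, pg_model.trainable_variables))
    return loss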
The policy agent class:
class PolicyAgent:
    def __init__(self, player_name):
        self.name = player_name
        self.value = PLAYER[self.name]
    def board_to_input(self, board):
        input_ = np.array([0] * 27)
        for i, val in enumerate(board):
            if val == self.value:
                input_[i] = 1
            if val == self.value * -1:
                input_[i+9] = 1
            if val == 0:
                input_[i+18] = 1
        return np.reshape(input_, (1,-1))
    def start(self, learning_rate=0.001, gamma=0.1):
        self.learning_rate = learning_rate
        self.gamma = gamma
        self.moves = list(range(0,9))
        self.state_memory = []
        self.action_memory = []
        self.reward = [] #just one reward at end
        self.nn = NeuralNetwork(self.learning_rate)
    def turn(self, board, availableMoves):
        actions_prob = self.nn.predict(self.board_to_input(board))
        actions_prob = np.array(actions_prob)
        actions_prob /= actions_prob.sum() # normalize
        move = np.random.choice(self.moves, p=actions_prob)
        while move not in availableMoves:
            move = np.random.choice(self.moves, p=actions_prob)
        #print("Move: ", move)
        self.state_memory.append(self.board_to_input(board.copy()))
        self.action_memory.append(move)
        return move
    def calculate_rewards(self, end_reward):
        discounted_r = np.zeros(len(self.action_memory))
        running_add = end_reward
        for t in reversed(range(0, len(self.action_memory))):
            discounted_r[t] = running_add
            running_add = running_add * self.gamma
        return discounted_r.tolist()
    def calculateG(self, discounted_r):
        G = np.zeros_like(discounted_r)
        for t in range(len(discounted_r)):
            G_sum = 0
            discount = 1
            for k in range(t, len(discounted_r)):
                G_sum += discounted_r[k] * discount
                discount *= self.gamma
            G[t] = G_sum
        mean = np.mean(G)
        std = np.std(G) if np.std(G) > 0 else 1
        G = (G - mean) / std
        return G
    def learn(self, result):
        if result == 0:
            reward = 0.5
        elif result == self.value:
            reward = 1
        else:
            reward = 0
        discounted_r = self.calculate_rewards(reward)
        #print("discounted_r value: ", discounted_r)
        G = self.calculateG(discounted_r)
        #print("G value: ", G)
        self.nn.training(self.state_memory, self.action_memory, G)
        self.state_memory = []
        self.action_memory = []
        self.reward = [] #just one reward at end
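To make the discounting concrete, here is a small hypothetical trace of the two helper methods above, with gamma = 0.1, a final reward of 1, and four recorded moves (the dummy moves only set the episode length):
# Hypothetical trace: gamma = 0.1, end reward = 1, four recorded moves
trace_agent = PolicyAgent("O")
trace_agent.start(gamma=0.1)
trace_agent.action_memory = [0, 1, 2, 3]           # dummy moves, only the length matters here
discounted_r = trace_agent.calculate_rewards(1)    # ≈ [0.001, 0.01, 0.1, 1.0]
print(discounted_r)
G = trace_agent.calculateG(discounted_r)           # discounted sums ≈ [0.004, 0.03, 0.2, 1.0],
print(G)                                           # then normalized to ≈ [-0.75, -0.69, -0.27, 1.70]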
My test of the random player against the policy agent:
PLAYER = {"X": 1, "O": -1}
player20 = PolicyAgent("O")
u = TicTacTo()
#u.simulations(RandomPlayer("X"), player20, 5, False)
evaluu(u, RandomPlayer("X"), player20, 5)
Results:
End Win X: 57.99999999999999%, Win O: 28.000000000000004%, Draw: 14.000000000000002%
Round: 0
End Win X: 56.99999999999999%, Win O: 24.0%, Draw: 19.0%
Round: 1
End Win X: 59.0%, Win O: 24.0%, Draw: 17.0%
Round: 2
End Win X: 60.0%, Win O: 34.0%, Draw: 6.0%
Round: 3
End Win X: 57.99999999999999%, Win O: 30.0%, Draw: 12.0%
Round: 4
Here you can see that X (the random player) wins by a clearly higher percentage in every round. X's win rate should actually go down as the agent learns a policy, but that is not happening.