In this tic-tac-toe game using RL, I penalize the agent whenever it does not win. The RL agent is player X. The model is trained for 10,000 episodes. To determine the Q-values, I only update the array indices corresponding to the positions still available on the board, which is handled by the following function:
def get_valid_index(state):
    i = 0
    valid_index = []
    for a in state:
        if a == '-' and a != 'X' and a != 'O':
            valid_index.append(i)
        i = i + 1
    return valid_index
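For example (just a quick sanity check of the helper above), for a made-up position with X in cell 0 and O in cell 2, I would expect the indices of the remaining empty cells:

print(get_valid_index('X-O------'))   # -> [1, 3, 4, 5, 6, 7, 8]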
I break out of the current training episode once the game is won, which corresponds to "done" for the given episode:
if reward == 1:
    q_table[state][action] += .1 * (reward - q_table[state][action])
    break
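With the 0.1 learning rate, the first time a state/action pair receives this terminal update its value should move from 0.0 to 0.1 (a quick check, assuming the Q-row was initialised to zeros):

q_sa = 0.0
q_sa += .1 * (1 - q_sa)   # terminal update with reward = 1, no bootstrap term
print(q_sa)               # 0.1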
Full code:
%reset -f
import random
from functools import reduce
import pandas as pd
import numpy as np
from ast import literal_eval
import math
def epsilon_greedy(epsilon, state, q_table):

    def get_valid_index(state):
        i = 0
        valid_index = []
        for a in state:
            if a == '-' and a != 'X' and a != 'O':
                valid_index.append(i)
            i = i + 1
        return valid_index

    def get_arg_max_sub(values, indices):
        i = 0
        npa = max(np.array(values)[indices])
        for ii in values:
            if ii == npa:
                return i
            i = i + 1
        return None

    if np.random.rand() < epsilon:
        return random.choice(get_valid_index(state))
    else:
        if state not in q_table:
            q_table[state] = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
        q_row = q_table[state]
        return get_arg_max_sub(q_row, get_valid_index(state))

def make_move(current_player, current_state, q_table):
    if current_player == 'X':
        next_move = epsilon_greedy(1, current_state, q_table)
        return current_state[:next_move] + 'X' + current_state[next_move+1:]
    else:
        next_move = epsilon_greedy(1, current_state, q_table)
        return current_state[:next_move] + 'O' + current_state[next_move+1:]

q_table = {}
max_steps = 9

def get_other_player(p):
    if p == 'X':
        return 'O'
    else:
        return 'X'

def win_by_diagonal(mark, board):
    return (board[0] == mark and board[4] == mark and board[8] == mark) or (board[2] == mark and board[4] == mark and board[6] == mark)

def win_by_vertical(mark, board):
    return (board[0] == mark and board[3] == mark and board[6] == mark) or (board[1] == mark and board[4] == mark and board[7] == mark) or (board[2] == mark and board[5] == mark and board[8] == mark)

def win_by_horizontal(mark, board):
    return (board[0] == mark and board[1] == mark and board[2] == mark) or (board[3] == mark and board[4] == mark and board[5] == mark) or (board[6] == mark and board[7] == mark and board[8] == mark)

def win(mark, board):
    return win_by_diagonal(mark, board) or win_by_vertical(mark, board) or win_by_horizontal(mark, board)

def get_reward(player, state):
    reward = 0
    is_win = win(player, list(state))
    if is_win:
        reward = 1
    else:
        reward = -1
    return reward

stats = []
for episode in range(0, 10000):
    t = 0
    state = '---------'
    player = 'X'
    action = epsilon_greedy(.1, state, q_table)
    if episode % 1000 == 0:
        print('in episode:', episode)
    while t < max_steps:
        t = t + 1
        next_state = make_move(player, state, q_table)
        next_action = epsilon_greedy(.1, state, q_table)
        reward = get_reward('X', state)
        if reward == 1:
            q_table[state][action] += .1 * (reward - q_table[state][action])
            break
        if next_state not in q_table:
            q_table[next_state] = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
        q_table[state][action] += .1 * (reward + (.1 * q_table[next_state][next_action]) - q_table[state][action])
        state, action = next_state, next_action
        player = get_other_player(player)
    stats.append(reward)
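For context, the non-terminal update inside the loop is meant to follow what I understand to be the standard tabular SARSA rule, Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a') - Q(s,a)), with alpha = gamma = 0.1. Written out on its own (sarsa_update is only an illustrative helper, not part of the code above):

def sarsa_update(q_sa, reward, q_next_sa, alpha=0.1, gamma=0.1):
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a') - Q(s,a))
    return q_sa + alpha * (reward + gamma * q_next_sa - q_sa)

# e.g. a non-winning step (reward -1) from zero-initialised entries:
print(sarsa_update(0.0, -1, 0.0))   # -0.1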
Testing the agent against a random player:
x_win_1 = []
o_win_0 = []
d_game = []
game_states = []
number_games = 5000
for ii in range(0, number_games):
    if ii % 1000 == 0:
        print('In game ', ii)
        print('The number of X (trained RL algo) wins', sum(x_win_1))
        print('The number of O (computer making random moves) wins', sum(o_win_0))
        print('Computer RL algorithm wins', sum(x_win_1), 'out of total', number_games, 'of games')
        print('Computer making random moves wins', sum(o_win_0), 'out of total', number_games, 'of games')
    available_moves = [0, 1, 2, 3, 4, 5, 6, 7, 8]
    current_game_state = '---------'
    for i in range(0, 5):
        randomer_move = random.choice(available_moves)
        current_game_state = current_game_state[:randomer_move] + 'O' + current_game_state[randomer_move+1:]
        available_moves.remove(randomer_move)
        if i == 4:
            break
        computer_move_pos = epsilon_greedy(1, current_game_state, q_table)
        current_game_state = current_game_state[:computer_move_pos] + 'X' + current_game_state[computer_move_pos+1:]
        available_moves.remove(computer_move_pos)
    is_win = win('X', list(current_game_state))
    if is_win == True:
        x_win_1.append(1)
    is_win = win('O', list(current_game_state))
    if is_win == True:
        o_win_0.append(1)
The random player wins roughly twice as many games as the trained RL agent:
In game 0
The number of X (trained RL algo) wins 0
The number of O (computer making random moves) wins 0
Computer RL algorithm wins 0 out of total 5000 of games
Computer making random moves wins 0 out of total 5000 of games
In game 1000
The number of X (trained RL algo) wins 297
The number of O (computer making random moves) wins 592
Computer RL algorithm wins 297 out of total 5000 of games
Computer making random moves wins 592 out of total 5000 of games
In game 2000
The number of X (trained RL algo) wins 589
The number of O (computer making random moves) wins 1182
Computer RL algorithm wins 589 out of total 5000 of games
Computer making random moves wins 1182 out of total 5000 of games
In game 3000
The number of X (trained RL algo) wins 864
The number of O (computer making random moves) wins 1792
Computer RL algorithm wins 864 out of total 5000 of games
Computer making random moves wins 1792 out of total 5000 of games
In game 4000
The number of X (trained RL algo) wins 1171
The number of O (computer making random moves) wins 2369
Computer RL algorithm wins 1171 out of total 5000 of games
Computer making random moves wins 2369 out of total 5000 of games
Have I implemented TD learning incorrectly?