In this tic-tac-toe game using RL, I penalize the agent whenever it does not win. The RL agent is player X. The model is trained for 10,000 episodes. To determine the Q-values, I only update the array indices corresponding to the positions still available on the board, which is handled by the following function:
def get_valid_index(state):
    i = 0
    valid_index = []
    for a in state:
        if a == '-' and a != 'X' and a != 'O':
            valid_index.append(i)
        i = i + 1
    return valid_index
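For example (just a quick sanity check of the helper above), for a made-up position with X in cell 0 and O in cell 2, I would expect the indices of the remaining empty cells:

print(get_valid_index('X-O------'))   # -> [1, 3, 4, 5, 6, 7, 8]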
I break out of the current training episode once the game is won, which corresponds to "done" for the given episode:
if reward == 1:
    q_table[state][action] += .1 * (reward - q_table[state][action])
    break
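With the 0.1 learning rate, the first time a state/action pair receives this terminal update its value should move from 0.0 to 0.1 (a quick check, assuming the Q-row was initialised to zeros):

q_sa = 0.0
q_sa += .1 * (1 - q_sa)   # terminal update with reward = 1, no bootstrap term
print(q_sa)               # 0.1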
Full code:
%reset -f
import random
from functools import reduce
import pandas as pd
import numpy as np
from ast import literal_eval
import math
def epsilon_greedy(epsilon, state, q_table):

    def get_valid_index(state):
        i = 0
        valid_index = []
        for a in state:
            if a == '-' and a != 'X' and a != 'O':
                valid_index.append(i)
            i = i + 1
        return valid_index

    def get_arg_max_sub(values, indices):
        i = 0
        npa = max(np.array(values)[indices])
        for ii in values:
            if ii == npa:
                return i
            i = i + 1
        return None

    if np.random.rand() < epsilon:
        return random.choice(get_valid_index(state))
    else:
        if state not in q_table:
            q_table[state] = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
        q_row = q_table[state]
        return get_arg_max_sub(q_row, get_valid_index(state))

def make_move(current_player, current_state, q_table):
    if current_player == 'X':
        next_move = epsilon_greedy(1, current_state, q_table)
        return current_state[:next_move] + 'X' + current_state[next_move+1:]
    else:
        next_move = epsilon_greedy(1, current_state, q_table)
        return current_state[:next_move] + 'O' + current_state[next_move+1:]

q_table = {}
max_steps = 9

def get_other_player(p):
    if p == 'X':
        return 'O'
    else:
        return 'X'

def win_by_diagonal(mark, board):
    return (board[0] == mark and board[4] == mark and board[8] == mark) or (board[2] == mark and board[4] == mark and board[6] == mark)

def win_by_vertical(mark, board):
    return (board[0] == mark and board[3] == mark and board[6] == mark) or (board[1] == mark and board[4] == mark and board[7] == mark) or (board[2] == mark and board[5] == mark and board[8] == mark)

def win_by_horizontal(mark, board):
    return (board[0] == mark and board[1] == mark and board[2] == mark) or (board[3] == mark and board[4] == mark and board[5] == mark) or (board[6] == mark and board[7] == mark and board[8] == mark)

def win(mark, board):
    return win_by_diagonal(mark, board) or win_by_vertical(mark, board) or win_by_horizontal(mark, board)

def get_reward(player, state):
    reward = 0
    is_win = win(player, list(state))
    if is_win:
        reward = 1
    else:
        reward = -1
    return reward

stats = []
for episode in range(0, 10000):
    t = 0
    state = '---------'
    player = 'X'
    action = epsilon_greedy(.1, state, q_table)
    if episode % 1000 == 0:
        print('in episode:', episode)
    while t < max_steps:
        t = t + 1
        next_state = make_move(player, state, q_table)
        next_action = epsilon_greedy(.1, state, q_table)
        reward = get_reward('X', state)
        if reward == 1:
            q_table[state][action] += .1 * (reward - q_table[state][action])
            break
        if next_state not in q_table:
            q_table[next_state] = np.array([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0])
        q_table[state][action] += .1 * (reward + (.1 * q_table[next_state][next_action]) - q_table[state][action])
        state, action = next_state, next_action
        player = get_other_player(player)
    stats.append(reward)
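For context, the non-terminal update inside the loop is meant to follow what I understand to be the standard tabular SARSA rule, Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a') - Q(s,a)), with alpha = gamma = 0.1. Written out on its own (sarsa_update is only an illustrative helper, not part of the code above):

def sarsa_update(q_sa, reward, q_next_sa, alpha=0.1, gamma=0.1):
    # Q(s,a) <- Q(s,a) + alpha * (r + gamma * Q(s',a') - Q(s,a))
    return q_sa + alpha * (reward + gamma * q_next_sa - q_sa)

# e.g. a non-winning step (reward -1) from zero-initialised entries:
print(sarsa_update(0.0, -1, 0.0))   # -0.1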
Testing the agent against a random player:
x_win_1 = []
o_win_0 = []
d_game = []
game_states = []
number_games = 5000
for ii in range(0, number_games):
    if ii % 1000 == 0:
        print('In game ', ii)
        print('The number of X (trained RL algo) wins', sum(x_win_1))
        print('The number of O (computer making random moves) wins', sum(o_win_0))
        print('Computer RL algorithm wins', sum(x_win_1), 'out of total', number_games, 'of games')
        print('Computer making random moves wins', sum(o_win_0), 'out of total', number_games, 'of games')
    available_moves = [0, 1, 2, 3, 4, 5, 6, 7, 8]
    current_game_state = '---------'
    for i in range(0, 5):
        randomer_move = random.choice(available_moves)
        current_game_state = current_game_state[:randomer_move] + 'O' + current_game_state[randomer_move+1:]
        available_moves.remove(randomer_move)
        if i == 4:
            break
        computer_move_pos = epsilon_greedy(1, current_game_state, q_table)
        current_game_state = current_game_state[:computer_move_pos] + 'X' + current_game_state[computer_move_pos+1:]
        available_moves.remove(computer_move_pos)
    is_win = win('X', list(current_game_state))
    if is_win == True:
        x_win_1.append(1)
    is_win = win('O', list(current_game_state))
    if is_win == True:
        o_win_0.append(1)
The random player wins roughly twice as many games as the trained RL agent:
In game 0
The number of X (trained RL algo) wins 0
The number of O (computer making random moves) wins 0
Computer RL algorithm wins 0 out of total 5000 of games
Computer making random moves wins 0 out of total 5000 of games
In game 1000
The number of X (trained RL algo) wins 297
The number of O (computer making random moves) wins 592
Computer RL algorithm wins 297 out of total 5000 of games
Computer making random moves wins 592 out of total 5000 of games
In game 2000
The number of X (trained RL algo) wins 589
The number of O (computer making random moves) wins 1182
Computer RL algorithm wins 589 out of total 5000 of games
Computer making random moves wins 1182 out of total 5000 of games
In game 3000
The number of X (trained RL algo) wins 864
The number of O (computer making random moves) wins 1792
Computer RL algorithm wins 864 out of total 5000 of games
Computer making random moves wins 1792 out of total 5000 of games
In game 4000
The number of X (trained RL algo) wins 1171
The number of O (computer making random moves) wins 2369
Computer RL algorithm wins 1171 out of total 5000 of games
Computer making random moves wins 2369 out of total 5000 of games
Have I implemented TD learning incorrectly?