pytorch强化学习说明

时间:2020-08-04 15:54:15

标签: python tensorflow pytorch reinforcement-learning

我正在尝试用强化学习创建一个国际象棋引擎。我在网上找到了一些用 Python 和 PyTorch 编写的代码。由于我对 TensorFlow 更为熟悉,能否请人帮忙解释这段代码的含义,并尽可能帮我把它“转换”为 TensorFlow 代码?

此外,当前定义的模型是否足够“健壮”,能够达到像样的棋力?如果不够,我该如何改进?我尝试向其中添加更多 nn.Linear 层,但这似乎反而削弱了模型。

编辑:这是代码的原始链接:https://colab.research.google.com/drive/1Xk9MibJ9Fli5tIlDvo88hcZrI76rqZN5#scrollTo=ZxHEghUq9JWM

EDIT2:另外,我想弄清楚如何实现一个机制,把模型先前的训练结果保存下来,并在当前代码中加载后继续使用。有人可以给我指个方向吗?

请注意,以下代码是在链接中原始代码的基础上修改而来的。修改后的代码:

import chess
import chess.pgn
import chess.engine
import torch
import numpy as np
import os
import torch.nn as nn
from torch.nn import functional as F
#os.remove("Games.txt")
def board_to_tensor(board):
    """Encode a board as an 8x8 float tensor of signed piece types.

    The 64 squares are visited in python-chess's flattened square order. An
    empty square stays 0; an occupied square holds the value returned by
    ``board.piece_type_at``, negated when the square is occupied by
    ``chess.BLACK``.

    Args:
        board: Object exposing ``piece_type_at(square)`` and the
            ``occupied_co`` bitboards (e.g. a python-chess board).

    Returns:
        A ``torch.float`` tensor of shape ``(8, 8)`` in which row ``r``,
        column ``c`` corresponds to square index ``r * 8 + c``.
    """
    encoded = torch.zeros(64, dtype=torch.float)
    for square in range(64):
        piece_type = board.piece_type_at(square)
        if not piece_type:
            continue
        is_black = bool(board.occupied_co[chess.BLACK] & chess.BB_SQUARES[square])
        rank, file_ = divmod(square, 8)
        encoded[rank * 8 + file_] = -piece_type if is_black else piece_type
    return encoded.reshape(8, 8)


def move_to_index_tensor(move):
    """Map a move to its slot in the flattened 64*64 action space.

    The slot is ``from_square * 64 + to_square``. Both squares can be
    recovered with ``divmod(index, 64)``, and a square's row and column with
    ``divmod(square, 8)``.

    Args:
        move: Object with integer ``from_square`` and ``to_square``
            attributes.

    Returns:
        A one-element ``torch.LongTensor`` holding the action index.
    """
    origin, destination = move.from_square, move.to_square
    return torch.LongTensor([origin * 64 + destination])
  
def filter_legal_moves(legal_moves):
    """Return the moves that are not under-promotions, in their original order.

    A move is kept when it is not a promotion at all (``promotion is None``)
    or when it promotes to a queen (promotion code 5). Promotions to any
    other piece are dropped, so every (from, to) pair maps to at most one
    remaining move.

    Args:
        legal_moves: Iterable of objects with a ``promotion`` attribute.

    Returns:
        A new list containing the retained move objects.
    """
    queen_code = 5  # the only promotion target the action space represents
    return [
        move
        for move in legal_moves
        if move.promotion is None or move.promotion == queen_code
    ]


def legal_moves_to_index_tensors(legal_moves):
    """Convert each move to its action-index tensor.

    Args:
        legal_moves: Iterable of move objects accepted by
            ``move_to_index_tensor``.

    Returns:
        A list with one index tensor per move, in the same order.
    """
    index_tensors = []
    for move in legal_moves:
        index_tensors.append(move_to_index_tensor(move))
    return index_tensors

# Policy network layout: a flattened 8*8 board (64 values) goes in, one logit
# per (from_square, to_square) pair (64*64 values, also flattened) comes out,
# with a single hidden layer of 512 units in between.

class Network(nn.Module):
    """Fully connected policy network with one hidden layer.

    Args:
        number_of_actions: Size of the output layer, i.e. the number of
            logits produced per input board.
    """

    def __init__(self, number_of_actions=64*64):
        super().__init__()

        hidden_units = 512
        self.layer1 = nn.Linear(64, hidden_units)
        self.layer5 = nn.Linear(hidden_units, number_of_actions)

        # Xavier-uniform weights for both layers; biases keep nn.Linear's
        # default initialization.
        for layer in (self.layer1, self.layer5):
            nn.init.xavier_uniform_(layer.weight)

    def forward(self, x):
        """Return raw move logits for a batch of flattened boards.

        The logits are unnormalized; callers turn them into probabilities
        (e.g. through a softmax) afterwards.
        """
        hidden = F.relu(self.layer1(x))
        return self.layer5(hidden)

# Instantiate the policy network with one output per (from, to) square pair.
# NOTE(review): `net` is assigned again right before the optimizer is created
# further down, so this first instance is discarded.
net = Network(number_of_actions=64*64)

def discount_rewards(collected_moves, gamma=0.99):
    """Replace every move's reward with its discounted return, in place.

    Walking from the last move to the first, each entry's reward becomes
    ``reward + gamma * (return of the following move)``.

    Args:
        collected_moves: List of mutable ``[log_prob, reward]`` entries; the
            second element of each entry is overwritten.
        gamma: Discount factor applied per step.

    Returns:
        None. ``collected_moves`` is modified in place.
    """
    discounted_return = 0.0
    for entry in reversed(collected_moves):
        discounted_return = discounted_return * gamma + entry[1]
        entry[1] = discounted_return
  

def normalize_rewards(collected_moves):
    """Standardize the rewards of all collected moves in place.

    The rewards are shifted to zero mean and scaled to unit (population)
    standard deviation. When every reward is the same the spread is zero, so
    the rewards are only centred (all become 0) instead of being divided by
    zero and turned into NaN.

    Args:
        collected_moves: List of mutable ``[log_prob, reward]`` entries; the
            second element of each entry is overwritten. An empty list is
            left untouched.

    Returns:
        None. ``collected_moves`` is modified in place.
    """
    if not collected_moves:
        return

    # np.float64 instead of the np.float alias, which NumPy 1.24 removed.
    rewards = np.asarray([move[1] for move in collected_moves], dtype=np.float64)
    rewards -= rewards.mean()
    spread = rewards.std()
    # Skip the division when the rewards are (numerically) identical.
    if spread > np.finfo(np.float64).eps:
        rewards /= spread

    for move, reward in zip(collected_moves, rewards):
        move[1] = reward

from torch.distributions import Categorical
import random


def get_games_data(policy_net, episodes=100, d=1):
    """Play games against Stockfish and collect policy-gradient samples.

    The network plays White: its logits are restricted to the legal moves
    (queen promotions only) and a move is sampled from them. The engine
    answers as Black. Every finished game is appended to ``Games.txt`` as a
    space-separated move list followed by the result.

    The network's last move of a game receives the outcome reward (+1.0 when
    it delivered checkmate, -1.0 when it was checkmated, 0.1 for any ending
    that is not a checkmate). That reward is discounted back over the game's
    earlier network moves, and after all games the rewards are normalized
    across the whole batch.

    The engine process is shut down in a ``finally`` clause so that an
    interrupted or failing game does not leave a Stockfish process behind.

    Args:
        policy_net: Callable invoked with a batched, flattened board tensor;
            its output is indexed along dim 1 with ``from * 64 + to``.
        episodes: Number of games to play. With 0 no engine is started and
            an empty result is returned.
        d: Search depth passed to the engine through ``chess.engine.Limit``.

    Returns:
        A tuple ``(all_moves, stats, win_count)``: ``all_moves`` is a list of
        ``[log_prob, reward]`` entries for every network move, ``stats`` is a
        dict with the ``"lost"``, ``"draw"`` and ``"win"`` counts, and
        ``win_count`` repeats the number of wins.
    """
    all_moves = []
    lost_count = 0
    draw_count = 0
    win_count = 0

    for _ in range(episodes):
        engine = chess.engine.SimpleEngine.popen_uci("stockfish-5-linux/Linux/stockfish_14053109_x64")
        try:
            engine.configure({"Clear Hash": True})

            board = chess.Board(fen='rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1')
            collected_moves = []
            # 1: the network (White) is to move, -1: the engine is to move.
            board_sign = 1
            played_moves = []
            while not board.is_game_over():
                if board_sign == 1:
                    # Flatten the board and add a batch dimension.
                    board_tensor = board_to_tensor(board).reshape(-1)
                    board_tensor_batched = board_tensor.unsqueeze(0)
                    logits = policy_net(board_tensor_batched)
                    # Keep only the logits that belong to legal moves.
                    current_legal_moves = filter_legal_moves(list(board.legal_moves))
                    legal_moves_index_tensors = legal_moves_to_index_tensors(current_legal_moves)
                    legal_moves_logits = logits[:, legal_moves_index_tensors]
                    # Sample among the legal moves; the sampled position
                    # indexes current_legal_moves.
                    categorical_sampler = Categorical(logits=(legal_moves_logits))
                    sampled_action = categorical_sampler.sample()
                    sampled_action_move_object = current_legal_moves[sampled_action]
                    log_prob = categorical_sampler.log_prob(sampled_action)
                    board.push(sampled_action_move_object)
                    played_moves.append(str(sampled_action_move_object))
                    # [log-probability of the sampled move, reward]; the
                    # reward stays 0.0 until the game outcome is known.
                    collected_moves.append([log_prob, 0.0])
                else:
                    result = engine.play(board, chess.engine.Limit(depth=d, nodes=3))
                    board.push(result.move)
                    played_moves.append(str(result.move))

                board_sign = board_sign * -1
        finally:
            engine.quit()

        # One space between all moves and before the result, so the final
        # move is never fused with the result string.
        with open("Games.txt", "a") as file:
            file.write(" ".join(played_moves) + " " + str(board.result()))
            file.write("\n\n\n")

        if board.is_checkmate():
            # After the last flip, board_sign == 1 means the network is the
            # side to move, i.e. the side that has been checkmated.
            if board_sign == 1:
                reward = -1.0
                lost_count = lost_count + 1
            else:
                reward = 1.0
                win_count = win_count + 1
        else:
            reward = 0.1
            draw_count = draw_count + 1

        collected_moves[-1][1] = reward
        discount_rewards(collected_moves)
        all_moves.extend(collected_moves)

    stats = {
        "lost": lost_count,
        "draw": draw_count,
        "win": win_count,
    }
    # Nothing to normalize when no game was played.
    if all_moves:
        normalize_rewards(all_moves)
    return all_moves, stats, win_count

import torch.optim as optim

# Build the policy network and its Adam optimizer; both states are replaced
# below by the values stored in the checkpoint.
net = Network(number_of_actions=64*64)
optimizer = optim.Adam(net.parameters(), lr=0.01)

# Resume from a previously saved checkpoint holding the model and optimizer
# state dicts.
# NOTE(review): this path differs from the "Checkpoint.pt" written at the end
# of the script — confirm the file is copied/renamed between runs.
checkpoint = torch.load("/content/drive/My Drive/Checkpoint2Layer.pt")
net.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

from livelossplot import PlotLosses
liveloss = PlotLosses()
# Stop it when you are happy with the displayed results
# Each iteration takes a while, be patient
# `dep` is the engine search depth passed to get_games_data as `d`; training
# runs until it reaches 11.
dep=1
while dep<11:
  try:
    # Play a batch of 100 games at the current engine depth.
    collected_moves, stats, wins = get_games_data(policy_net=net, episodes=100,d=dep)
    # Raise the engine depth once more than 70 of the 100 games were won.
    if int(wins)>70:
      dep+=1
    # Split the [log_prob, reward] entries into two parallel sequences.
    logs = [collected_move[0] for collected_move in collected_moves]
    rewards = [collected_move[1] for collected_move in collected_moves]
    logs_tensor = torch.cat(logs)
    rewards_tensor = torch.FloatTensor(rewards)
    optimizer.zero_grad()
    # Policy-gradient loss: negative log-probability of each sampled move
    # weighted by its reward, summed over the whole batch.
    policy_loss = -logs_tensor * rewards_tensor
    policy_loss = policy_loss.sum()
    policy_loss.backward()    
    optimizer.step()  
    # Plot the lost/draw/win counts of this batch.
    liveloss.update(stats)
    liveloss.draw()
  except KeyboardInterrupt:
    # Leave the loop on Ctrl-C so the checkpoint below is still written.
    break
# Save the model and optimizer state once training stops.
torch.save({
            'model_state_dict': net.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            }, "Checkpoint.pt")

#torch.save(net.state_dict(),"ChessNet.pt")

0 个答案:

没有答案