I am trying to put together a simulation of a simplified blackjack game that returns the optimal policy for each state.
The blackjack simulation itself seems to work correctly, but I am somehow getting an error when I try to apply the Q-learning algorithm to reach the optimal policy.
Here is my code. I believe it is well documented; the error is in the Q-learning block, starting around line 170, and it is reproducible:
#Applying reinforcement learning to blackjack. We will suppose here that the croupier only has 1 pack of cards
#Initial tabs
packinit = c(rep(1,4), rep(2,4),rep(3,4),rep(4,4),rep(5,4),rep(6,4),rep(7,4),rep(8,4),
rep(9,4),rep(10,16))
#In our game, and for simplification of the problem, aces will always count as 1. Other figures are worth 10.
#If both player and croupier have the same score, then the player loses.
#Croupier will draw cards until he has 17 or more.
handPinit = NULL # will contain hand of player
handCinit = NULL # will contain hand of the croupier
list = list(handPinit, handCinit, packinit)
# Methods ####################################################################################
##############################################################################################
#Random integer, returns an integer to choose card
randInt = function(pack){
int = runif(1) * length(pack)
int = int+1
int = round(int, 0)
return(int)
}
#Picks a card, assigns it to the desired hand and deletes it from the pack.
pickC = function(hand, pack){
int = randInt(pack)
hand = c(hand, pack[int])
pack = pack[-int]
return(list(hand, pack))
}
score = function(handC){
return(sum(handC, na.rm = T))
}
printWinner = function(resultList){
res = resultList[[4]]
p = res[1]
c = res[2]
if((p > c && p <= 21) || (p <= 21 && c > 21)){
cat("Player has won with ", p, ", croupier has ", c, ".\n", sep = "")
}else{
cat("Player has lost with ", p, ", croupier has ", c, ".\n", sep = "")
}
}
#Black jack sim :
simulation = function(handP, handC, pack){
#Matrix to stock choice and next state, 1st is state, 2nd is choice, 3rd is reward, 4th is start state
cs = NULL
#pick first card
temp = NULL
temp = pickC(handP, pack)
handP = temp[[1]]
pack = temp[[2]]
temp = pickC(handC, pack)
handC = temp[[1]]
pack = temp[[2]]
#stock result
cs = rbind(cs, c(score(handP), 1, 0.1, 0))
#pick second card
temp = pickC(handP, pack)
handP = temp[[1]]
pack = temp[[2]]
temp = pickC(handC, pack)
handC = temp[[1]]
pack = temp[[2]]
#stock result
cs = rbind(cs, c(score(handP), 1, 0.1, cs[length(cs[,1]), 1]))
#reward stock final
reward = NULL
#to change with algo decision
while(score(handC) < 17){
#rand number to choose action, 1 = draw
rand = round(2*runif(1),0)
#if a = 1, draw a card
if(rand == 1 && score(handP) < 21){
temp = pickC(handP, pack)
handP = temp[[1]]
pack = temp[[2]]
cs = rbind(cs, c(score(handP), 1, 0.1, cs[length(cs[,1]), 1] ))
}else{
cs = rbind(cs, c(score(handP), 0, 0.1, cs[length(cs[,1]), 1]))
}
#if croupier < 17, he draws a card
if(score(handC) < 17){
temp = pickC(handC, pack)
handC = temp[[1]]
pack = temp[[2]]
}
}
#get scores
scores = c(score(handP), score(handC))
resultList = list(handP, handC, pack, scores)
#get reward
res = resultList[[4]]
p = res[1]
c = res[2]
if((p > c && p <= 21) || (p <= 21 && c > 21)){
reward = 100
}else{
reward = -50
}
#Assign the reward as the reward of the last line of cs
cs[length(cs[,1]), 3] = reward
#return full list
resultList = list(handP, handC, pack, scores, cs)
return(resultList)
}
#Function for simulation, outputs a tab containing states, actions and choices
simRand = function(k){
resultsRand = NULL
for(i in 1:k){
#init pack and hands
pack = c(rep(1,4), rep(2,4),rep(3,4),rep(4,4),rep(5,4),rep(6,4),rep(7,4),rep(8,4),
rep(9,4),rep(10,16))
handC = NULL
handP = NULL
#simulation k
res = simulation(handP, handC, pack)
resultsRand = rbind(resultsRand, res[[5]])
#resets for next iteration
pack = c(rep(1,4), rep(2,4),rep(3,4),rep(4,4),rep(5,4),rep(6,4),rep(7,4),rep(8,4),
rep(9,4),rep(10,16))
handC = NULL
handP = NULL
}
return(resultsRand)
}
#test
for(i in 1:10){
results = simulation(handPinit, handCinit, packinit)
printWinner(results)
}
#used to get the max Q-value for the decision
getRowMax = function(tab){
temp = tab[1]
for(i in 2:length(tab)){
if(tab[i] > temp){
temp = tab[i]
}
}
}
#####################################################################
#Q-learning
#####################################################################
#Represent sets of Q(s, a)
Qvalues = matrix(1, nrow = 30, ncol = 2)
simResults = simRand(1000)
#Hyperparameters
alpha = 0.9
discount = 0.1
#for all rows simulated, update qvalues.
for(i in 1:length(simResults[,1])){
st = simResults[i, 4] #st
a = simResults[i, 2] #a
stPlusOne = simResults[i, 1] #st+1
Qvalues[st, a] = Qvalues[st, a] + alpha * ( simResults[i,3] * discount * getRowMax(Qvalues[stPlusOne, ]) - Qvalues[st, a] )
}
Answer 0 (score 0):
As LucyMLi pointed out:

First, you need to add return(temp) to the getRowMax function. But there is another problem with your simulation: some of the values in simResults[, 1] are 0, which means Qvalues[stPlusOne, ] is empty, so you cannot compute getRowMax() on it.
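A minimal sketch of how both points could be applied is below. The return(temp) fix is exactly what is described above; skipping the rows whose next-state index is 0 with next is only one possible way to handle them (re-encoding the states so that 0 never appears would be an alternative), and the Qvalues update line itself is copied unchanged from the question.

#getRowMax with the missing return added: without it the function returns NULL
getRowMax = function(tab){
  temp = tab[1]
  for(i in 2:length(tab)){
    if(tab[i] > temp){
      temp = tab[i]
    }
  }
  return(temp)
}

#Guarded version of the update loop: rows whose next-state index is 0 are skipped,
#so Qvalues[stPlusOne, ] is never an empty selection.
for(i in 1:length(simResults[,1])){
  st = simResults[i, 4]        #st
  a = simResults[i, 2]         #a (1 = draw, 0 = stand)
  stPlusOne = simResults[i, 1] #st+1
  if(stPlusOne == 0){
    next #Qvalues[0, ] would be empty, so this row cannot be used for an update
  }
  Qvalues[st, a] = Qvalues[st, a] + alpha * ( simResults[i,3] * discount * getRowMax(Qvalues[stPlusOne, ]) - Qvalues[st, a] )
}

With these two changes the loop should run over all simulated rows; whether the resulting policy is what you expect is a separate question, since the update rule itself is left untouched.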