Question

我正在使用 Sutton & Barto 的电子书《强化学习：导论》（Reinforcement Learning: An Introduction）来学习强化学习。我在尝试复现 action-value 页面上的结果（图）时遇到了一些问题。

更具体地说，如何模拟每项任务的greedy值？这本书说：

……我们可以绘制各种方法在 1000 多次游戏中随着经验积累而不断改进时的性能和行为……

所以我想，随着不断找到更好的值，我必须持续跟踪探索得到的值。问题是贪心方法该如何做到这一点——既然没有探索性的动作，我怎么知道哪个才是贪心动作？

感谢所有的评论和答案！

更新：请参阅我的答案中的代码。

Answer 1

我终于做对了。正如书中所指出的那样，eps 玩家应该击败贪婪玩家，因为它会采取探索性的动作。代码很慢，还需要一些优化，不过代码如下：

enter image description here

# Build one randomly generated bandit testbed.
#
# Draws the true mean payoff of every arm from N(u, sdev.arm), then
# pre-samples the reward each arm pays on each play from
# N(true mean of that arm, sdev.rewards).
#
# Args:
#   arms:         number of arms.
#   plays:        number of plays to pre-sample for every arm.
#   u:            mean of the distribution the true arm means are drawn from.
#   sdev.arm:     standard deviation of that distribution.
#   sdev.rewards: standard deviation of the rewards around each arm's mean.
#
# Returns:
#   list(optimal = numeric vector (length `arms`) of true arm means,
#        rewards = `plays` x `arms` numeric matrix of pre-sampled rewards).
get.testbed <- function(arms = 10, plays = 500, u = 0, sdev.arm = 1, sdev.rewards = 1) {

  optimal <- rnorm(arms, u, sdev.arm)

  # vapply() pins the length of every per-arm draw, and the explicit matrix()
  # keeps the plays x arms shape even for a single play, where sapply() would
  # collapse the result to a plain vector and break rewards[i, idx] indexing.
  draws <- vapply(optimal, function(x) rnorm(plays, x, sdev.rewards), numeric(plays))
  rewards <- matrix(draws, nrow = plays, ncol = arms)

  list(optimal = optimal, rewards = rewards)
}

# Play one epsilon-greedy run on a freshly generated testbed.
#
# On every play the player pulls a uniformly random arm with probability
# `eps`, otherwise the arm with the highest estimated reward so far (ties go
# to the lowest arm index, as which.max() does). Estimates start at 0 and are
# updated as incremental sample averages.
#
# Args:
#   arms, plays, u, sdev.arm, sdev.rewards: forwarded to get.testbed().
#   eps: probability of taking an exploratory (random) action on a play.
#
# Returns:
#   list(slot.rewards = final estimated reward per arm,
#        reward.hist  = reward obtained on each play,
#        optimal.hist = 1 where the arm with the highest true mean was pulled,
#                       0 otherwise,
#        pulls        = number of times each arm was pulled).
play.slots <- function(arms = 10, plays = 500, u = 0, sdev.arm = 1, sdev.rewards = 1, eps = 0.1) {

  testbed <- get.testbed(arms, plays, u, sdev.arm, sdev.rewards)
  optimal <- testbed$optimal
  rewards <- testbed$rewards

  # A single play may arrive as a plain vector (one reward per arm); restore
  # the plays x arms shape so that rewards[i, idx] is always valid.
  if (plays > 0 && !is.matrix(rewards)) {
    rewards <- matrix(rewards, nrow = plays, ncol = arms)
  }

  optim.index <- which.max(optimal)
  slot.rewards <- rep(0, arms)
  reward.hist <- rep(0, plays)
  optimal.hist <- rep(0, plays)
  pulls <- rep(0, arms)
  probs <- runif(plays)

  # TODO: vectorise
  # seq_len() so that plays = 0 performs no iteration at all (1:0 would run twice).
  for (i in seq_len(plays)) {

    # The condition is a scalar, so a plain if/else is used instead of ifelse().
    idx <- if (probs[i] < eps) sample(arms, 1) else which.max(slot.rewards)
    reward <- rewards[i, idx]
    reward.hist[i] <- reward

    if (idx == optim.index) {
      optimal.hist[i] <- 1
    }

    # Incremental sample average: new = old + (reward - old) / n.
    pulls[idx] <- pulls[idx] + 1
    slot.rewards[idx] <- slot.rewards[idx] + (reward - slot.rewards[idx]) / pulls[idx]
  }

  list(slot.rewards = slot.rewards, reward.hist = reward.hist, optimal.hist = optimal.hist, pulls = pulls)
}

# Compare epsilon-greedy players on the n-armed bandit and plot the results.
#
# For every value in `eps`, runs play.slots() N times (each run on a fresh
# testbed), averages the per-play reward and the optimal-move indicator over
# the N runs, and draws two stacked plots: average reward per play and the
# running fraction of optimal moves.
#
# Args:
#   N:    number of independent runs averaged per player.
#   arms, plays, u, sdev.arm, sdev.rewards: forwarded to play.slots().
#   eps:  vector of exploration probabilities, one player per value.
#
# Returns (invisibly):
#   list(rewards.hist = plays x length(eps) matrix of average rewards,
#        optim.hist   = plays x length(eps) matrix with the running fraction
#                       of optimal moves).
#
# Requires the RColorBrewer package for the line colours.
do.simulation <- function(N = 100, arms = 10, plays = 500, u = 0, sdev.arm = 1, sdev.rewards = 1, eps = c(0.0, 0.01, 0.1)) {

  stopifnot(length(eps) >= 1)

  # Fail before the (slow) simulation rather than after it if the plotting
  # dependency is missing; require() would only warn and carry on.
  if (!requireNamespace("RColorBrewer", quietly = TRUE)) {
    stop("Package 'RColorBrewer' is required to draw the plots.", call. = FALSE)
  }

  n.players <- length(eps)
  col.names <- paste('eps', eps)
  rewards.hist <- matrix(0, nrow = plays, ncol = n.players, dimnames = list(NULL, col.names))
  optim.hist <- rewards.hist

  # Accumulate the per-play histories of N runs for every player.
  for (p in seq_len(n.players)) {
    for (i in seq_len(N)) {
      play.results <- play.slots(arms, plays, u, sdev.arm, sdev.rewards, eps[p])
      rewards.hist[, p] <- rewards.hist[, p] + play.results$reward.hist
      optim.hist[, p] <- optim.hist[, p] + play.results$optimal.hist
    }
  }

  rewards.hist <- rewards.hist / N
  optim.hist <- optim.hist / N
  # Running mean down each column; assigning into optim.hist[] keeps the
  # matrix shape and column names even when apply() returns a plain vector.
  optim.hist[] <- apply(optim.hist, 2, function(x) cumsum(x) / seq_along(x))

  ### Plot helper ###
  # Draws the first `n.series` columns of `x` as lines on one plot, with a
  # light grid and a legend; `...` is passed on to plot.ts() and lines().
  plot.result <- function(x, n.series, colors, leg.names, ...) {
    for (i in seq_len(n.series)) {
      if (i == 1) {
        plot.ts(x[, i], ylim = 2 * range(x), col = colors[i], ...)
      } else {
        lines(x[, i], col = colors[i], ...)
      }
    }
    grid(col = 'lightgray')
    legend('topleft', leg.names, col = colors, lwd = 2, cex = 0.6, box.lwd = NA)
  }
  ### Plot helper ###

  #### Plots ####
  colors <- RColorBrewer::brewer.pal(n.players + 3, 'Set2')

  # Two stacked panels; restore the caller's layout even if plotting fails.
  op <- par(mfrow = c(2, 1))
  on.exit(par(op), add = TRUE)

  plot.result(rewards.hist, n.players, colors, col.names, xlab = 'Plays', ylab = 'Average reward', lwd = 2)
  plot.result(optim.hist, n.players, colors, col.names, xlab = 'Plays', ylab = 'Optimal move %', lwd = 2)
  #### Plots ####

  invisible(list(rewards.hist = rewards.hist, optim.hist = optim.hist))
}

要运行它，只需调用

# Average 100 runs of a 10-armed bandit for each of eps = 0, 0.01 and 0.1, then plot the results.
do.simulation(N = 100, arms = 10, eps = c(0, 0.01, 0.1))

Answer 2

您还可以选择使用 R 软件包“contextual”，该软件包的目的是简化上下文无关（如 Sutton & Barto 中所述）和上下文相关（如 LinUCB）的多臂老虎机策略。

该软件包实际上提供了一份说明文档（vignette），介绍如何复现 Sutton & Barto 中所有的老虎机图。例如，要生成 ε-贪婪图，只需针对高斯老虎机（Gaussian bandit）模拟 EpsilonGreedy 策略：

library(contextual)

set.seed(2)

# Per-arm mean and standard deviation handed to the bandit (10 arms).
mus    <- rnorm(10, 0, 1)
sigmas <- rep(1, 10)
bandit <- BasicGaussianBandit$new(mu_per_arm = mus, sigma_per_arm = sigmas)

# One epsilon-greedy agent per exploration rate, all playing the same bandit.
epsilons     <- c(0, 0.1, 0.01)
agent.labels <- c("e = 0, greedy", "e = 0.1", "e = 0.01")
agents <- lapply(seq_along(epsilons), function(i) {
  Agent$new(EpsilonGreedyPolicy$new(epsilons[i]), bandit, agent.labels[i])
})

# Run every agent for the configured horizon and number of simulations.
simulator <- Simulator$new(agents = agents, horizon = 1000, simulations = 2000)
history   <- simulator$run()

# Two plots of the recorded history: type "average" and type "optimal".
plot(history, type = "average", regret = FALSE, lwd = 1, legend_position = "bottomright")
plot(history, type = "optimal", lwd = 1, legend_position = "bottomright")

EpsilonGreedy

Gaussian bandit

完全公开：我是该软件包的开发者之一。

Answer 3

这是我到目前为止基于聊天的内容：

set.seed(1)

# Pre-sample the reward table of one Gaussian bandit problem.
#
# Assumes each action has a normal reward distribution: the true mean of every
# arm is drawn from N(0, 1), and each arm's rewards are drawn from
# N(true mean, 1).
#
# Args:
#   arms:  number of arms.
#   plays: number of rewards to sample per arm.
#
# Returns:
#   A `plays` x `arms` numeric matrix; entry [p, a] is the reward arm `a`
#   pays on play `p`.
getRewardsGaussian <- function(arms, plays) {

  # first generate new means
  QStar <- rnorm(arms, 0, 1)

  # then for each mean, generate `plays`-many samples
  draws <- vapply(QStar, function(u) rnorm(plays, u, 1), numeric(plays))

  # Explicit matrix(): keeps the plays x arms shape for a single play, where
  # sapply() would have collapsed the result to a plain vector.
  matrix(draws, nrow = plays, ncol = arms)
}


# Simulate one bandit problem played by several action-selection methods.
#
# All methods play against the same pre-sampled reward table obtained from
# getRewardsGaussian(arms, plays). Every method keeps its own running average
# reward per arm (starting at 0) and, on each play, pulls the arm with the
# highest running average (ties go to the lowest arm index). The methods named
# "epsi1" and "epsi2" additionally replace that pick by a uniformly random arm
# with probability `epsi1` / `epsi2`; every other method is purely greedy.
#
# Args:
#   arms:    number of arms (>= 1).
#   epsi1:   exploration probability of the method named "epsi1".
#   epsi2:   exploration probability of the method named "epsi2".
#   plays:   number of plays (>= 1).
#   methods: character vector of method names; becomes the column names of
#            the result.
#
# Returns:
#   A `plays` x length(methods) matrix whose entry [p, m] is the average
#   reward method `m` has received over plays 1..p.
CalculateRewardsPerMethod <- function(arms=7, epsi1=0.01, epsi2=0.1
                    , plays=1000, methods=c("greedy", "epsi1", "epsi2")) {

  stopifnot(arms >= 1, plays >= 1, length(methods) >= 1)

  # names for easy handling
  names(methods) <- methods
  n.methods <- length(methods)
  arm.names <- sprintf("Arm%02d", seq_len(arms))

  # this could be different if not all actions' rewards have a gaussian dist.
  rewards.source <- getRewardsGaussian(arms, plays)
  # A single play may arrive as a plain vector; restore the plays x arms shape.
  if (!is.matrix(rewards.source)) {
    rewards.source <- matrix(rewards.source, nrow = plays, ncol = arms)
  }

  # Exploration probability of the methods that explore at all.
  explore.prob <- c(epsi1 = epsi1, epsi2 = epsi2)

  # Current running average reward and pull count of each arm, per method.
  running.avgs <- matrix(0, nrow = arms, ncol = n.methods
                         , dimnames = list(Arm = arm.names, Method = methods))
  pull.counts <- running.avgs

  # Three dimensional array to track the outcome of each play, according to each method
  rewards.received <-
    array(NA_real_, dim = c(plays, 2, n.methods)
          , dimnames = list(PlayNo. = seq_len(plays), Outcome = c("Arm", "Reward"), Method = methods))

  # defined internally so that it reads the current running.avgs
  chooseAnArm <- function() {
    # Note that in a tie, which.max returns the lowest index, which is what we want
    picks <- vapply(methods, function(m) unname(which.max(running.avgs[, m])), integer(1))

    # Note: each exploring method draws its own random number, in a separate
    #       step, so that the draws of the methods are not related
    for (m in intersect(names(explore.prob), methods)) {
      if (runif(1) < explore.prob[[m]]) {
        picks[[m]] <- sample(arms, 1)
      }
    }

    picks
  }

  ## TODO:  Perform each action at least once, then select according to algorithm
  ## Starting point: every method starts at machine 3 (or at the last machine
  ## when there are fewer than 3 arms).
  start.arm <- min(3, arms)

  ## HERE IS WHERE WE START PULLING THE LEVERS ##
  ## ----------------------------------------- ##
  for (p in seq_len(plays)) {
    # The choice is based on the averages as they stand after play p - 1.
    choice <- if (p == 1) setNames(rep(start.arm, n.methods), methods) else chooseAnArm()
    reward <- rewards.source[p, choice]

    rewards.received[p, "Arm", ] <- choice
    rewards.received[p, "Reward", ] <- reward

    ## Update the running averages.
    ## For each method only the arm chosen this round changes, so update the
    ## (arm, method) cells incrementally: new = old + (reward - old) / n.
    hit <- cbind(choice, seq_len(n.methods))
    pull.counts[hit] <- pull.counts[hit] + 1
    running.avgs[hit] <- running.avgs[hit] + (reward - running.avgs[hit]) / pull.counts[hit]
  } # end for-loop


  ## DIFFERENT RETURN OPTIONS ##
  ## ------------------------ ##


  ## All rewards received, in simplifed matrix (dropping information on arm chosen)
  # return(rewards.received[, "Reward", ])

  ## All rewards received, along with which arm chosen:
  #   return(rewards.received)

  ## Running averages of the rewards received by method
  ## (rebuilt as an explicit matrix so that a single play or a single method
  ##  still yields a plays x methods matrix)
  reward.by.method <- matrix(rewards.received[, "Reward", ], nrow = plays, ncol = n.methods
                             , dimnames = dimnames(rewards.received)[c("PlayNo.", "Method")])
  reward.by.method[] <- apply(reward.by.method, 2, cumsum)
  return( reward.by.method / seq_len(plays) )

}


### EXECUTION (AND SIMULATION)

## PARAMETERS
arms    <- 10
plays   <- 1000
epsi1   <- 0.01
epsi2   <- 0.1
simuls  <- 50  # 2000
methods <- c("greedy", "epsi1", "epsi2")

## Single iteration:
### time one run to estimate how long the whole simulation will take
tme <- system.time(
  CalculateRewardsPerMethod(arms = arms, epsi1 = epsi1, epsi2 = epsi2, plays = plays)
)
eta.minutes <- round((simuls * tme[["elapsed"]]) / 60, 1)
cat("Expected run time is approx: ", eta.minutes, " minutes")

## Multiple iterations (simulations), stacked into a plays x methods x simuls array
rewards.received.list <- replicate(
  simuls,
  CalculateRewardsPerMethod(arms = arms, epsi1 = epsi1, epsi2 = epsi2, plays = plays),
  simplify = "array"
)

## Average every (play, method) cell across the simulations
rewards.received <- apply(rewards.received.list, 1:2, mean)

## RESULTS
head(rewards.received, 17)
MeanRewards <- rewards.received

## If using an alternate return method in `Calculate..` use the two lines below to calculate running avg
#   CumulRewards <- apply(rewards.received, 2, cumsum)
#   MeanRewards  <- CumulRewards / (1:plays)

## PLOT
## One line per method; the greedy series opens the plot, the others are added.
series.cols <- c(greedy = 'red', epsi1 = 'orange', epsi2 = 'navy')

plot.ts(MeanRewards[, "greedy"], col = series.cols[["greedy"]], lwd = 2,
        ylim = range(MeanRewards), ylab = 'Average reward', xlab = "Plays")
for (m in c("epsi1", "epsi2")) {
  lines(MeanRewards[, m], col = series.cols[[m]], lwd = 2)
}
grid(col = 'darkgray')

legend('bottomright',
       c('greedy', paste("epsi1 =", epsi1), paste("epsi2 =", epsi2)),
       col = unname(series.cols), lwd = 2, cex = 0.8)

enter image description here

Answer 4

您可能还想查看此链接 https://www.datahubbs.com/multi_armed_bandits_reinforcement_learning_1/

下面是从上述来源复制的相关代码。它不使用 R，只使用 numpy 的 np.random.rand()。

class eps_bandit:
    '''
    epsilon-greedy k-bandit problem

    Inputs
    =====================================================
    k: number of arms (int)
    eps: probability of random action, 0 <= eps <= 1 (float).
        With eps = 0 the very first action is still chosen at
        random; every later action is greedy.
    iters: number of steps (int)
    mu: set the average rewards for each of the k-arms.
        Set to "random" for the rewards to be selected from
        a normal distribution with mean = 0.
        Set to "sequence" for the means to be ordered from
        0 to k-1.
        Pass a list or array of length = k for user-defined
        values.

    Raises
    =====================================================
    ValueError: if mu is a string other than "random" or
        "sequence".
    '''

    def __init__(self, k, eps, iters, mu='random'):
        # Number of arms
        self.k = k
        # Search probability
        self.eps = eps
        # Number of iterations
        self.iters = iters
        # Step count
        self.n = 0
        # Step count for each arm
        self.k_n = np.zeros(k)
        # Total mean reward
        self.mean_reward = 0
        # Running mean reward recorded after each step of run()
        self.reward = np.zeros(iters)
        # Mean reward for each arm
        self.k_reward = np.zeros(k)

        if isinstance(mu, str):
            if mu == 'random':
                # Draw means from probability distribution
                self.mu = np.random.normal(0, 1, k)
            elif mu == 'sequence':
                # Increase the mean for each arm by one
                self.mu = np.linspace(0, k-1, k)
            else:
                # Fail here instead of leaving self.mu undefined until pull()
                raise ValueError(
                    "mu must be 'random', 'sequence' or a sequence of "
                    "arm means, got %r" % (mu,))
        else:
            # User-defined averages
            self.mu = np.array(mu)

    def pull(self):
        '''Take one action, observe its reward and update the estimates.'''
        # Generate random number
        p = np.random.rand()
        if self.eps == 0 and self.n == 0:
            # Pure greedy: all estimates are still zero, so start at random
            a = np.random.choice(self.k)
        elif p < self.eps:
            # Randomly select an action
            a = np.random.choice(self.k)
        else:
            # Take greedy action
            a = np.argmax(self.k_reward)

        reward = np.random.normal(self.mu[a], 1)

        # Update counts
        self.n += 1
        self.k_n[a] += 1

        # Update total (incremental mean over all steps)
        self.mean_reward = self.mean_reward + (
            reward - self.mean_reward) / self.n

        # Update results for a_k (incremental mean for the pulled arm)
        self.k_reward[a] = self.k_reward[a] + (
            reward - self.k_reward[a]) / self.k_n[a]

    def run(self):
        '''Pull iters times, recording the running mean reward per step.'''
        for i in range(self.iters):
            self.pull()
            self.reward[i] = self.mean_reward

    def reset(self):
        '''Reset the results while keeping the settings (k, eps, iters, mu).'''
        self.n = 0
        self.k_n = np.zeros(self.k)
        self.mean_reward = 0
        self.reward = np.zeros(self.iters)
        self.k_reward = np.zeros(self.k)

R中的n臂强盗模拟

4 个答案: