Python implementation of temporal difference learning not converging to the optimal policy

Asked: 2015-07-07 01:07:07

Tags: python machine-learning

I am trying to create an implementation of temporal difference learning in Python based on this paper (warning: the link downloads a PDF). However, I cannot seem to get it to converge to the optimal policy, and I am wondering whether my implementation is computing the wrong thing, since I am not much of a coder.
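
As far as I can tell, the paper describes a Sarsa(lambda)-style update with eligibility traces, which is what I am trying to reproduce (this is only my reading of it, so the misunderstanding may already start there):

delta_t = r_t + gamma * Q_k(s_{t+1}, a_{t+1}) - Q_k(s_t, a_t)
e_t(s_t, a_t) = 1;   e_{t+1}(s, a) = lambda * e_t(s, a)   (all traces reset to 0 after an exploratory step)
Q_{k+1}(s, a) = Q_k(s, a) + 1 / (1 + visits(s, a)) * delta_t * e_t(s, a)

Here is my code: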

import random

class QLearn:
    # Initialize the agent. Actions is a list of allowable prices.
    def __init__(self, actions=[1,2,3,4,5,6,7,8,9,10], state=15, action=1, discount=0.999, lamb=0.9):
        self.actions = actions
        self.state = state
        self.action = action
        self.discount = discount
        self.lamb = lamb
        self.visits = {}
        self.history = {}

    # Gets the new state that results from selling the given number of units
    def getNextState(self, sold):
        newstate = self.state - sold
        if newstate < 0:
            newstate = 0
        return newstate

    # Multiply sold and price to get revenue reward
    def getReward(self, sold):
        if sold > self.state:
            reward = self.state * self.action
        else:
            reward = self.action * sold
        return reward

    # Gets the Q value for a given state-action pair at episode k and period t
    def getQ(self, k, t, state, action):
        return self.q[(k, t)].get((state, action), 0.0)

    # Gets the eligibility trace for a given state-action pair at episode k and period t
    def getET(self, k, t, state, action):
        return self.et[(k, t)].get((state, action), 0.0)

    # Choose which action to take in the next time period
    def chooseAction(self, k, t):
        # Decide whether to explore or exploit
        if random.random() < (1/float(k)):
            # Exploring, so a random action is chosen
            newaction = random.choice(self.actions)
            explore = True
            self.history[t] = (self.state, self.action)
        else:
            # Get a list of Q values for every action
            q = [self.getQ(k, t+1, self.nextState, action) for action in self.actions]
            maxQ = max(q)
            count = q.count(maxQ)
            # If several actions tie for the maximum Q value, choose one of them at random
            if count > 1:
                best = [i for i in range(len(self.actions)) if q[i] == maxQ]
                i = random.choice(best)
                explore = False
            else:
                i = q.index(maxQ)
                explore = False
            newaction = self.actions[i]
            # Record decision for k and t in history
            self.history[t] = (self.state, self.action)
        return newaction, explore

    # Calculate the temporal difference error
    def TDError(self, k, t):
        if t == 10:
            delta = self.reward + self.discount * 0 - self.getQ(k, t, self.state, self.action)
        else:
            delta = self.reward + self.discount * self.getQ(k, t+1, self.nextState,
                        self.nextAction) - self.getQ(k, t, self.state, self.action)
        return delta

    # Update the eligibility trace for the current state
    def updateCurrentET(self, k, t, state, action):
        self.et[(k, t)][(state, action)] = 1

    # Update the Q values for the next episode for all state-action pairs used during this episode
    def updateQs(self, k, t):
        for i in range(t, 0, -1):
            self.q[(k+1, t)][self.history[i]] = self.q[(k, i)].get(self.history[i], 0) + \
                (1 / float(1 + self.visits.get(self.history[i], 0))) * self.tdError * \
                self.et[t].get(self.history[i], 0)

    # Update the eligibility traces for all state-action pairs used in the episode
    def updateETs(self, k, t):
        # Set all to zero if exploration
        if self.explore:
            for i in range(t, 0, -1):
                self.et[t+1][self.history[i]] = 0
        else:
            for i in range(t-1, 0, -1):
                self.et[t+1][self.history[i]] = self.lamb * self.et[t].get(self.history[i], 0.0)

    # Assign the next state and action to the current state and action
    def stepForward(self):
        self.state = self.nextState
        self.action = self.nextAction
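
The AutoVivification that the simulation imports alongside QLearn is not shown above; it is essentially the standard nested-dictionary recipe (mine looks roughly like this):

class AutoVivification(dict):
    # Nested dictionary that creates missing levels on first access,
    # so agent.et[t][(state, action)] = 1 works without setting up agent.et[t] beforehand
    def __missing__(self, key):
        value = self[key] = type(self)()
        return value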

Here is what I am using as the simulation:

import random
from qlearn.qlearn import QLearn, AutoVivification

# This simulates the process by which sales are realized and applies the Q-Learning algorithm to it
# The full run of episodes is repeated 1000 times and the revenue of the final episode is averaged across runs

def main():
    # Define periods and episodes, instantiate the agent
    periods = 10
    episodes = 200
    agent = QLearn()
    revHistory = []
    firstSold = getSold(1,agent.action,agent.state)
    for k in range(1,episodes+1,1):
        revenue = 0
        # Process for first episode
        if k == 1:
            for t in range(1,periods+1,1):
                # Process for first period and episode
                if t == 1:
                    revenue = revenue + firstPeriodandEpisode(agent, firstSold)
                # Process for last period
                elif t == periods:
                    revenue = revenue + lastPeriod(agent, k, t)
                # Process for all other periods
                else:
                    revenue = revenue + update(agent, k, t)
        else:
            for t in range(1,periods+1,1):
                # Process for first period
                if t == 1:
                    revenue = revenue + firstPeriod(agent, k, t)
                # Process for last period
                elif t == periods:
                    revenue = revenue + lastPeriod(agent, k, t)
                # Process for all other periods
                else:
                    revenue = revenue + update(agent, k, t)
        # Updates the number of visits to each state-action pair at the end of each episode
        for i in range(1,periods+1,1):
            agent.visits[agent.history[i]] = agent.visits.get(agent.history[i],0) + 1
        revHistory.append(revenue)
    # Returns the revenue realized in the final episode
    return revHistory[-1]

def firstPeriodandEpisode(agent, sold):
    # Creates dictionaries for eligibility traces and Q values
    agent.et = AutoVivification()
    agent.q = AutoVivification()
    # Gets the next state based on number sold
    agent.nextState = agent.getNextState(sold)
    # Calculates the reward by multiplying the number sold by the price
    agent.reward = agent.getReward(sold)
    # Chooses the next action by finding the maximum Q value
    agent.nextAction, agent.explore = agent.chooseAction(k=1, t=1)
    # Calculates the temporal difference error for the action
    agent.tdError = agent.TDError(k=1,t=1)
    # Assigns the number 1 to the current eligibility trace for the current state and action
    agent.et[1][(agent.state,agent.action)] = 1
    # Updates all Q values
    agent.updateQs(k=1,t=1)
    # Updates all eligibility traces
    agent.updateETs(k=1,t=1)
    # Advances time one step
    agent.stepForward()
    return agent.reward

def update(agent, k, t):
    # Gets number sold based on the equation in getSold()
    periodSold = getSold(t, agent.action, agent.state)
    # Gets the next state based on number sold
    agent.nextState = agent.getNextState(periodSold)
    # Calculates the reward by multiplying the number sold by the price
    agent.reward = agent.getReward(periodSold)
    # Chooses the next action by finding the maximum Q value
    agent.nextAction, agent.explore = agent.chooseAction(k, t)
    # Calculates the temporal difference error for the action
    agent.tdError = agent.TDError(k,t)
    # Assigns the number 1 to the current eligibility trace for the current state and action
    agent.et[t][(agent.state,agent.action)] = 1
    # Updates all Q values
    agent.updateQs(k,t)
    # Updates all eligibility traces
    agent.updateETs(k,t)
    # Advances time one step
    agent.stepForward()
    return agent.reward

def lastPeriod(agent, k, t):
    # Gets number sold based on the equation in getSold()
    periodSold = getSold(t, agent.action, agent.state)
    # Gets the next state based on number sold
    agent.nextState = agent.getNextState(periodSold)
    # Calculates the reward by multiplying the number sold by the price
    agent.reward = agent.getReward(periodSold)
    # Chooses the next action by finding the maximum Q value
    agent.nextAction, agent.explore = agent.chooseAction(k, t)
    # Calculates the temporal difference error for the action
    agent.tdError = agent.TDError(k,t)
    # Assigns the number 1 to the current eligibility trace for the current state and action
    agent.et[t][(agent.state,agent.action)] = 1
    # Updates all Q values
    agent.updateQs(k,t)
    # Updates all eligibility traces
    agent.updateETs(k,t)
    # Gets the action for the beginning of the next episode and makes it the current action
    agent.action, agent.explore = agent.nextEpisodeAction(k)
    # Sets current state to the beginning state
    agent.state = 15
    return agent.reward

def firstPeriod(agent, k, t):
    # All eligibility traces to 0
    agent.et = AutoVivification()
    # Clear state-action history
    agent.history = {}
    # Gets number sold based on the equation in getSold()
    periodSold = getSold(t, agent.action, agent.state)
    # Gets the next state based on number sold
    agent.nextState = agent.getNextState(periodSold)
    # Calculates the reward by multiplying the number sold by the price
    agent.reward = agent.getReward(periodSold)
    # Chooses the next action by finding the maximum Q value
    agent.nextAction, agent.explore = agent.chooseAction(k, t)
    # Calculates the temporal difference error for the action
    agent.tdError = agent.TDError(k,t)
    # Assigns the number 1 to the current eligibility trace for the current state and action
    agent.et[t][(agent.state,agent.action)] = 1
    # Updates all Q values
    agent.updateQs(k,t)
    # Updates all eligibility traces
    agent.updateETs(k,t)
    # Advances time one step
    agent.stepForward()
    return agent.reward

# Gets the number sold based on the time period, the action, and the state
def getSold(t, action, state):
    if state == 0:
        sold = 0
    else:
        # Equation that simulates the amount sold
        sold = .3 + .25*t - .014*action
        if sold < 0:
            sold = 0
    return int(round(sold,0))   

if __name__ == "__main__":
    simRev = []
    for i in range(1,1001,1):
        simRev.append(main())
    # Prints the average final revenue over all simulations
    print reduce(lambda x, y: x + y, simRev) / float(len(simRev))

The value it eventually spits out should be close to 150, since that is the maximum revenue it should find, but for some reason, no matter what I try, it gets stuck around 80.
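
As a sanity check on the 150 figure (my own back-of-the-envelope reasoning, in case that is also where I am wrong): at the highest price of 10 the demand equation sells exactly the starting inventory of 15 units over the 10 periods, so charging 10 every period should give 15 * 10 = 150:

# Revenue from always charging the highest price (action = 10), starting from 15 units
sold_per_period = [int(round(.3 + .25 * t - .014 * 10, 0)) for t in range(1, 11)]
print sold_per_period               # [0, 1, 1, 1, 1, 2, 2, 2, 2, 3] -> 15 units in total
print sum(sold_per_period) * 10     # 150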
