I am trying to create an implementation of temporal difference learning in Python based on this paper (warning: the link downloads a PDF). However, I can't seem to get it to converge to the optimal policy. I'm wondering whether my implementation is calculating the wrong thing, since I'm not much of a coder. Here is my code:
import random

class QLearn:
    # Initialize the agent. Actions is a list of allowable prices.
    def __init__(self, actions=[1,2,3,4,5,6,7,8,9,10], state=15, action=1, discount=0.999, lamb=0.9):
        self.actions = actions
        self.state = state
        self.action = action
        self.discount = discount
        self.lamb = lamb
        self.visits = {}
        self.history = {}

    # Gets the new state that results from selling a given number of units
    def getNextState(self, sold):
        newstate = self.state - sold
        if newstate < 0:
            newstate = 0
        return newstate

    # Multiply sold and price to get the revenue reward
    def getReward(self, sold):
        if sold > self.state:
            reward = self.state * self.action
        else:
            reward = self.action * sold
        return reward

    # Gets the Q value of a state-action pair for the given k and t
    def getQ(self, k, t, state, action):
        return self.q[(k, t)].get((state, action), 0.0)

    # Gets the eligibility trace of a state-action pair for the given k and t
    def getET(self, k, t, state, action):
        return self.et[(k, t)].get((state, action), 0.0)

    # Choose which action to take in the next time period
    def chooseAction(self, k, t):
        # Decide whether to explore or exploit
        if random.random() < (1/float(k)):
            # Exploring, so a random action is chosen
            newaction = random.choice(self.actions)
            explore = True
            self.history[t] = (self.state, self.action)
        else:
            # Get a list of Q values for every action
            q = [self.getQ(k, t+1, self.nextState, action) for action in self.actions]
            maxQ = max(q)
            count = q.count(maxQ)
            # If several actions tie for the maximum Q value, choose one of them at random
            if count > 1:
                best = [i for i in range(len(self.actions)) if q[i] == maxQ]
                i = random.choice(best)
                explore = False
            else:
                i = q.index(maxQ)
                explore = False
            newaction = self.actions[i]
        # Record decision for k and t in history
        self.history[t] = (self.state, self.action)
        return newaction, explore

    # Calculate the temporal difference error
    def TDError(self, k, t):
        if t == 10:
            delta = self.reward + self.discount * 0 - self.getQ(k, t, self.state, self.action)
        else:
            delta = self.reward + self.discount * self.getQ(k, t+1, self.nextState,
                    self.nextAction) - self.getQ(k, t, self.state, self.action)
        return delta

    # Update the eligibility trace for the current state
    def updateCurrentET(self, k, t, state, action):
        self.et[(k, t)][(state, action)] = 1

    # Update the Q values for the next episode for all state-action pairs used during this episode
    def updateQs(self, k, t):
        for i in range(t, 0, -1):
            self.q[(k+1, t)][self.history[i]] = self.q[(k, i)].get(self.history[i], 0) + \
                (1 / float(1 + self.visits.get(self.history[i], 0))) * self.tdError * \
                self.et[t].get(self.history[i], 0)

    # Update the eligibility traces for all state-action pairs used in the episode
    def updateETs(self, k, t):
        # Set all traces to zero if the move was exploratory
        if self.explore:
            for i in range(t, 0, -1):
                self.et[t+1][self.history[i]] = 0
        else:
            for i in range(t-1, 0, -1):
                self.et[t+1][self.history[i]] = self.lamb * self.et[t].get(self.history[i], 0.0)

    # Assign the next state and action to the current state and action
    def stepForward(self):
        self.state = self.nextState
        self.action = self.nextAction
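Not shown above: the AutoVivification helper that the simulation imports from this same module. For this post, assume it is the standard nested-dict recipe:

class AutoVivification(dict):
    # Assumed implementation: any missing key is filled with another
    # AutoVivification, so expressions like et[(k, t)][(state, action)]
    # work without creating the inner dictionary first.
    def __missing__(self, key):
        value = self[key] = type(self)()
        return value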
Here is what I am using as the simulation:
import random
from qlearn.qlearn import QLearn, AutoVivification

# This simulates the process by which sales are realized and applies the Q-Learning algorithm to it
# Each series of episodes is repeated 1000 times in order to find the average revenue realized across all episodes
def main():
    # Define periods and episodes, instantiate the agent
    periods = 10
    episodes = 200
    agent = QLearn()
    revHistory = []
    firstSold = getSold(1, agent.action, agent.state)
    for k in range(1, episodes+1, 1):
        revenue = 0
        # Process for first episode
        if k == 1:
            for t in range(1, periods+1, 1):
                # Process for first period and episode
                if t == 1:
                    revenue = revenue + firstPeriodandEpisode(agent, firstSold)
                # Process for last period
                elif t == periods:
                    revenue = revenue + lastPeriod(agent, k, t)
                # Process for all other periods
                else:
                    revenue = revenue + update(agent, k, t)
        else:
            for t in range(1, periods+1, 1):
                # Process for first period
                if t == 1:
                    revenue = revenue + firstPeriod(agent, k, t)
                # Process for last period
                elif t == periods:
                    revenue = revenue + lastPeriod(agent, k, t)
                # Process for all other periods
                else:
                    revenue = revenue + update(agent, k, t)
        # Updates the number of visits to each state-action pair at the end of each episode
        for i in range(1, periods+1, 1):
            agent.visits[agent.history[i]] = agent.visits.get(agent.history[i], 0) + 1
        revHistory.append(revenue)
    # Returns the final revenue realized
    return revHistory[-1]

def firstPeriodandEpisode(agent, sold):
    # Creates dictionaries for eligibility traces and Q values
    agent.et = AutoVivification()
    agent.q = AutoVivification()
    # Gets the next state based on the number sold
    agent.nextState = agent.getNextState(sold)
    # Calculates the reward by multiplying the number sold by the price
    agent.reward = agent.getReward(sold)
    # Chooses the next action by finding the maximum Q value
    agent.nextAction, agent.explore = agent.chooseAction(k=1, t=1)
    # Calculates the temporal difference error for the action
    agent.tdError = agent.TDError(k=1, t=1)
    # Assigns the number 1 to the current eligibility trace for the current state and action
    agent.et[1][(agent.state, agent.action)] = 1
    # Updates all Q values
    agent.updateQs(k=1, t=1)
    # Updates all eligibility traces
    agent.updateETs(k=1, t=1)
    # Advances time one step
    agent.stepForward()
    return agent.reward

def update(agent, k, t):
    # Gets the number sold based on the equation in getSold()
    periodSold = getSold(t, agent.action, agent.state)
    # Gets the next state based on the number sold
    agent.nextState = agent.getNextState(periodSold)
    # Calculates the reward by multiplying the number sold by the price
    agent.reward = agent.getReward(periodSold)
    # Chooses the next action by finding the maximum Q value
    agent.nextAction, agent.explore = agent.chooseAction(k, t)
    # Calculates the temporal difference error for the action
    agent.tdError = agent.TDError(k, t)
    # Assigns the number 1 to the current eligibility trace for the current state and action
    agent.et[t][(agent.state, agent.action)] = 1
    # Updates all Q values
    agent.updateQs(k, t)
    # Updates all eligibility traces
    agent.updateETs(k, t)
    # Advances time one step
    agent.stepForward()
    return agent.reward

def lastPeriod(agent, k, t):
    # Gets the number sold based on the equation in getSold()
    periodSold = getSold(t, agent.action, agent.state)
    # Gets the next state based on the number sold
    agent.nextState = agent.getNextState(periodSold)
    # Calculates the reward by multiplying the number sold by the price
    agent.reward = agent.getReward(periodSold)
    # Chooses the next action by finding the maximum Q value
    agent.nextAction, agent.explore = agent.chooseAction(k, t)
    # Calculates the temporal difference error for the action
    agent.tdError = agent.TDError(k, t)
    # Assigns the number 1 to the current eligibility trace for the current state and action
    agent.et[t][(agent.state, agent.action)] = 1
    # Updates all Q values
    agent.updateQs(k, t)
    # Updates all eligibility traces
    agent.updateETs(k, t)
    # Gets the action for the beginning of the next episode and makes it the current action
    agent.action, agent.explore = agent.nextEpisodeAction(k)
    # Sets the current state back to the starting inventory
    agent.state = 15
    return agent.reward

def firstPeriod(agent, k, t):
    # Sets all eligibility traces to 0
    agent.et = AutoVivification()
    # Clears the state-action history
    agent.history = {}
    # Gets the number sold based on the equation in getSold()
    periodSold = getSold(t, agent.action, agent.state)
    # Gets the next state based on the number sold
    agent.nextState = agent.getNextState(periodSold)
    # Calculates the reward by multiplying the number sold by the price
    agent.reward = agent.getReward(periodSold)
    # Chooses the next action by finding the maximum Q value
    agent.nextAction, agent.explore = agent.chooseAction(k, t)
    # Calculates the temporal difference error for the action
    agent.tdError = agent.TDError(k, t)
    # Assigns the number 1 to the current eligibility trace for the current state and action
    agent.et[t][(agent.state, agent.action)] = 1
    # Updates all Q values
    agent.updateQs(k, t)
    # Updates all eligibility traces
    agent.updateETs(k, t)
    # Advances time one step
    agent.stepForward()
    return agent.reward

# Gets the number sold based on the time period, the action, and the state
def getSold(t, action, state):
    if state == 0:
        sold = 0
    else:
        # Equation that simulates the amount sold
        sold = .3 + .25*t - .014*action
        if sold < 0:
            sold = 0
    return int(round(sold, 0))

if __name__ == "__main__":
    simRev = []
    for i in range(1, 1001, 1):
        simRev.append(main())
    # Prints the average final revenue over all simulations
    print reduce(lambda x, y: x + y, simRev) / float(len(simRev))
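For comparison, here is my understanding of a plain tabular SARSA(lambda) update with replacing traces. This sketch is my own paraphrase with a constant step size, not the paper's pseudocode; step is a hypothetical environment callback that returns (reward, next_state). My class differs in that it zeroes the traces after an exploratory move and uses a 1 / (1 + visits) step size.

import random

def sarsaLambdaEpisode(q, actions, startState, step, alpha=0.1,
                       gamma=0.999, lamb=0.9, epsilon=0.1, horizon=10):
    # One episode of textbook tabular SARSA(lambda) with replacing traces.
    # q maps (state, action) -> value and is updated in place.
    et = {}                                  # eligibility traces for this episode
    state = startState
    action = random.choice(actions)
    for t in range(1, horizon + 1):
        reward, nextState = step(state, action)
        # Epsilon-greedy choice of the next action
        if random.random() < epsilon:
            nextAction = random.choice(actions)
        else:
            nextAction = max(actions, key=lambda a: q.get((nextState, a), 0.0))
        # TD error; the bootstrap term is zero at the end of the horizon
        nextQ = 0.0 if t == horizon else q.get((nextState, nextAction), 0.0)
        delta = reward + gamma * nextQ - q.get((state, action), 0.0)
        # Replacing trace for the current pair, then update every visited pair
        et[(state, action)] = 1.0
        for sa in et:
            q[sa] = q.get(sa, 0.0) + alpha * delta * et[sa]
            et[sa] = gamma * lamb * et[sa]
        state, action = nextState, nextAction
    return q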
The value it eventually spits out should be close to 150, since that is the maximum it should find, but for some reason, no matter what I try, it gets stuck around 80.
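For context, the 150 figure comes from always charging the top price of 10: under the getSold() equation that sells exactly the 15 units of starting inventory over the 10 periods, so 15 x 10 = 150 is the best any policy can do. A standalone check (maxRevenueCheck is a throwaway helper written just for this post):

def maxRevenueCheck(periods=10, state=15, price=10):
    # Replays the demand equation from getSold() with a fixed price of 10
    # and sums the revenue, capping sales at the remaining inventory.
    revenue = 0
    for t in range(1, periods + 1):
        sold = 0 if state == 0 else int(round(.3 + .25*t - .014*price, 0))
        sold = min(sold, state)
        revenue += sold * price
        state -= sold
    return revenue

print maxRevenueCheck()   # prints 150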