我正在尝试将FrozenLake替代环境应用到我的旨在优化收益的简单项目中。我的项目有300个州(未售出的席位数量)和15个动作(席位价格变化)。
我试图自我。在代码中某些位置的前缀。
import gym
import numpy as np
import time, pickle, os
env = gym.make('FrozenLake-v0')
env.observation_space.n=300
env.action_space.n=15
epsilon = 0.9
total_episodes = 1000
max_steps = 50
lr_rate = 0.81
gamma = 0.96
Q = np.zeros((env.observation_space.n, env.action_space.n))
print(Q)
def choose_action(state):
action=0
if np.random.uniform(0, 1) < epsilon:
action = env.action_space.sample()
else:
action = np.argmax(Q[state, :])
return action
def learn(state, state2, reward, action, action2):
predict = Q[state, action]
target = reward + gamma * Q[state2, action2]
Q[state, action] = Q[state, action] + lr_rate * (target - predict)
# Start
rewards=0
for episode in range(total_episodes):
t = 0
state = env.reset()
action = choose_action(state)
while t < max_steps:
state2, reward, done, info = env.step(action)
action2 = choose_action(state2)
learn(state, state2, reward, action, action2)
print("these are env.render(state) and env.render(state2)")
env.render(state)
env.render(state2)
state = state2
action = action2
t += 1
rewards+=1
print("Q table for episode",episode,"and time",t,"is:")
print(Q)
if done:
break
# epsilon = min_epsilon + (max_epsilon - min_epsilon) * np.exp(-decay_rate * episode)
# os.system('clear')
time.sleep(0.1)
print ("Score over time: ", rewards/total_episodes)
print(Q)
它给出:
KeyError Traceback (most recent call last)
<ipython-input-5-4d1e8742ec38> in <module>()
55 while t < max_steps:
56
---> 57 state2, reward, done, info = env.step(action)
58
59
我希望获得一个不同于全零的Q表。