Here is my Google Colab notebook: https://colab.research.google.com/drive/1ed7ZIqi8bWGiJOGQYxne5d2upU2Vq5ea
My solution works fine when I register the 4x4 map. Now I want to solve the 8x8 map, but my Q-table is not updating and I don't know why. I suspect the number of steps is what keeps it from updating, but I'm not sure.
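To show what I mean by "not updating", this is the sanity check I run after training (a minimal sketch; it assumes the q_table and rewards_all_episodes variables from the script below):

import numpy as np

# If the agent never reaches the goal, every reward is 0, so each update is
# q*(1-lr) + lr*(0 + 0.99*max(zeros)) = 0 and the table stays all zeros.
print("non-zero Q-entries:", np.count_nonzero(q_table))
print("successful episodes:", int(sum(rewards_all_episodes)))

If the agent never reaches the goal, both counts stay at zero, since updating an all-zero table with zero reward yields zero again.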
Here is my code:
import numpy as np
import gym
import random
import time
from IPython.display import clear_output
from gym.envs.registration import register
register(
    id='FrozenLakeNotSlippery-v0',
    entry_point='gym.envs.toy_text:FrozenLakeEnv',
    kwargs={'map_name': '8x8', 'is_slippery': False},
    max_episode_steps=200,
    reward_threshold=0.78,  # optimum = .8196
)
# NOTE: re-running this cell raises "Cannot re-register id" because the id
# stays in gym's registry; restart the runtime before registering again.
direction = {
    0: "LEFT",
    1: "DOWN",
    2: "RIGHT",
    3: "UP"
}
env = gym.make("FrozenLakeNotSlippery-v0")
#env = gym.make("FrozenLake8x8-v0")
#env = gym.make("FrozenLake-v0")
action_space_size = env.action_space.n
state_space_size = env.observation_space.n
q_table = np.zeros((state_space_size, action_space_size))
#print(q_table)
num_episodes = 10000
max_steps_per_episode = 500
learning_rate = 0.1
discount_rate = 0.99
exploration_rate = 1
max_exploration_rate = 1
min_exploration_rate = 0.01
exploration_decay_rate = 0.001
rewards_all_episodes = []
# Q-learning algorithm
for episode in range(num_episodes):
    state = env.reset()
    done = False
    rewards_current_episode = 0

    for step in range(max_steps_per_episode):
        # Exploration-exploitation trade-off
        exploration_rate_threshold = random.uniform(0, 1)
        if exploration_rate_threshold > exploration_rate:
            action = np.argmax(q_table[state, :])
        else:
            action = env.action_space.sample()

        new_position = direction.get(action)  # human-readable action name (not used below)
        new_state, reward, done, info = env.step(action)
        #env.render()

        # Update Q-table for Q(s,a)
        new_Q_value = q_table[state, action] * (1 - learning_rate) + \
            learning_rate * (reward + discount_rate * np.max(q_table[new_state, :]))
        q_table[state, action] = new_Q_value

        state = new_state
        rewards_current_episode += reward

        if done:
            break

    # Exploration rate decay
    exploration_rate = min_exploration_rate + \
        (max_exploration_rate - min_exploration_rate) * np.exp(-exploration_decay_rate * episode)

    rewards_all_episodes.append(rewards_current_episode)
# Calculate and print the average reward per thousand episodes
rewards_per_thousand_episodes = np.split(np.array(rewards_all_episodes), num_episodes // 1000)
count = 1000
print("********Average reward per thousand episodes********\n")
for r in rewards_per_thousand_episodes:
    print(count, ": ", str(sum(r) / 1000))
    count += 1000
# Print updated Q-table
print("\n\n********Q-table********\n")
print(q_table)
input("Press any key to continue")
for episode in range(3):
    state = env.reset()
    done = False
    print("*****EPISODE ", episode + 1, "*****\n\n\n\n")
    time.sleep(1)

    for step in range(max_steps_per_episode):
        clear_output(wait=True)
        env.render(mode="human")
        time.sleep(0.3)

        action = np.argmax(q_table[state, :])
        new_state, reward, done, info = env.step(action)

        if done:
            clear_output(wait=True)
            env.render()
            if reward == 1:
                print("****You reached the goal!****")
                time.sleep(3)
            else:
                print("****You fell through a hole!****")
                time.sleep(3)
                clear_output(wait=True)
            break

        state = new_state

env.close()
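For reference, this is how the exploration rate decays under the formula in my training loop (a standalone sketch using the same constants, not part of the script above):

import numpy as np

# exploration_rate = min_rate + (max_rate - min_rate) * exp(-decay * episode)
for ep in (0, 1000, 5000, 10000):
    rate = 0.01 + (1 - 0.01) * np.exp(-0.001 * ep)
    print(ep, round(rate, 4))  # 1.0, ~0.3742, ~0.0167, ~0.01

So by the end of training the agent is acting almost purely greedily; I don't know whether that matters for the 8x8 map, which is why I mentioned the step/episode budget above.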