I wrote a Q-Learning implementation to solve the OpenAI FrozenLake-v0 problem with a simple NN.
My neural network looks like this:
Input layer: 16
Output layer: 4
The plain TensorFlow implementation works very well: after training for 10,000 episodes it successfully finishes about 70% of them.
I then wanted to write the same algorithm with Keras, but this version performs very poorly, finishing only about 5% of episodes over the same 10,000 episodes.
I suspect I made a mistake in the Keras implementation, but I can't figure out what it is.
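Both implementations feed the state into the network as a one-hot row vector built with np.identity(16); just to illustrate the shape that goes in (the state index 3 here is only an example):

import numpy as np

observation = 3  # example state index; FrozenLake-v0 has 16 discrete states
# Row slice of the identity matrix -> one-hot encoding with shape (1, 16)
state_input = np.identity(16)[observation:observation + 1]
print(state_input.shape)  # (1, 16)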
TensorFlow implementation:
import gym
import numpy as np
import tensorflow as tf
env = gym.make('FrozenLake-v0')
discount_rate = 0.99
random_action_chance = 0.1
num_episodes = 10000
max_episode_step = 100
log_interval = 100
tf.reset_default_graph()
inputs = tf.placeholder(shape=[1, 16], dtype=tf.float32)
W = tf.Variable(tf.random_uniform([16, 4], 0, 0.01))
Q = tf.matmul(inputs, W)
predict = tf.argmax(Q, 1)
Qnext = tf.placeholder(shape=[1, 4], dtype=tf.float32)
loss = tf.reduce_sum(tf.square(Qnext - Q))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.1)
updateModel = optimizer.minimize(loss)
init = tf.initialize_all_variables()
rewards_from_episodes = []
with tf.Session() as sess:
    sess.run(init)
    for episode in range(num_episodes):
        observation = env.reset()
        episode_reward = 0
        if episode % log_interval == 0 and episode > 0:
            print('Episode: {}, Reward: {}'.format(episode, sum(rewards_from_episodes[episode - log_interval: episode]) / log_interval))
        W1 = []
        for step in range(max_episode_step):
            # Select action
            action, targetQ = sess.run([predict, Q], feed_dict={inputs: np.identity(16)[observation:observation + 1]})
            if np.random.rand(1) < random_action_chance:
                action[0] = env.action_space.sample()
            new_observation, reward, done, _ = env.step(action[0])
            Qnew = sess.run(Q, feed_dict={inputs: np.identity(16)[new_observation:new_observation + 1]})
            maxQvalue = np.max(Qnew)
            targetQ[0, action[0]] = reward + discount_rate * maxQvalue
            # Train network using target and predicted Q values
            _, W1 = sess.run([updateModel, W], feed_dict={inputs: np.identity(16)[observation:observation + 1],
                                                          Qnext: targetQ})
            episode_reward += reward
            observation = new_observation
            if done:
                random_action_chance = 1. / ((episode / 50) + 10)
                break
        rewards_from_episodes.append(episode_reward)

print("Mean of all episodes: {}%".format(sum(rewards_from_episodes) / num_episodes))
Keras implementation:
import gym
import numpy as np
import tensorflow as tf
import random
from tensorflow.python.keras.layers import *
env = gym.make('FrozenLake-v0')
learning_rate = 0.1
discount_rate = 0.99
random_action_chance = 0.1
num_episodes = 10000
max_episode_step = 100
log_interval = 100
model = tf.keras.Sequential()
model.add(Dense(4, kernel_initializer='uniform'))
model.compile(optimizer=tf.train.GradientDescentOptimizer(learning_rate=learning_rate),
              loss='mean_squared_error')
rewards_from_episodes = []
for episode in range(num_episodes):
    observation = env.reset()
    episode_reward = 0
    if episode % log_interval == 0 and episode > 0:
        print('Episode: {}, Reward: {}'.format(episode, sum(
            rewards_from_episodes[episode - log_interval: episode]) / log_interval))
    for step in range(max_episode_step):
        # Select action
        targetQ = model.predict(np.identity(16)[observation:observation + 1], batch_size=1)
        action = np.argmax(targetQ)
        if random.random() < random_action_chance:
            action = env.action_space.sample()
        new_observation, reward, done, _ = env.step(action)
        Qnew = model.predict(np.identity(16)[new_observation:new_observation + 1], batch_size=1)
        maxQvalue = np.max(Qnew)
        targetQ[0, action] = reward + discount_rate * maxQvalue
        # Train network using target and predicted Q values
        model.fit(np.identity(16)[observation:observation + 1], targetQ, epochs=1, batch_size=1, verbose=0)
        episode_reward += reward
        observation = new_observation
        if done:
            random_action_chance = 1. / ((episode / 50) + 10)
            break
    rewards_from_episodes.append(episode_reward)

print("Mean of all episodes: {}%".format(sum(rewards_from_episodes) / num_episodes))