I am trying to build a DQN for Atari games in TensorFlow. Here is my code:
import tensorflow as tf
import gym
import numpy as np
import matplotlib.pyplot as plt
import random
from collections import deque
from skimage import transform
from skimage import io
from skimage.color import rgb2gray
from os import getcwd
class DQN_Agent:
    def DQN(self,x):
        layer1 = tf.layers.conv2d(x,32,5,padding='same',activation=tf.nn.relu)
        layer2 = tf.layers.conv2d(layer1,32,5,padding='same',activation=tf.nn.relu)
        layer3 = tf.layers.flatten(layer2)
        layer4 = tf.layers.dense(layer3,24,tf.nn.relu)
        layer5 = tf.layers.dense(layer4,24,tf.nn.relu)
        layer6 = tf.layers.dense(layer5,self.n_actions,tf.nn.softmax)
        return layer6

    def __init__(self,resize_dim,n_actions,replay_memory_size,history_length):
        self.resize_dim = resize_dim
        self.history_length = history_length
        self.history = deque(maxlen=self.history_length)
        self.n_actions = n_actions
        self.memory = deque(maxlen=replay_memory_size)
        self.learning_rate = 0.001
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.1
        self.x = tf.placeholder(tf.float32,[None,self.resize_dim,self.resize_dim,self.history_length])
        self.y = tf.placeholder(tf.float32,[None,self.n_actions])
        self.logits = self.DQN(self.x)
        self.loss = tf.losses.softmax_cross_entropy(self.y,self.logits)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
        self.sess = tf.Session()
        self.saver = tf.train.Saver()
        self.init = tf.global_variables_initializer()
        self.sess.run(self.init)

    def preprocess(self,images):
        for i in range(len(images)):
            images[i] = rgb2gray(images[i])
            images[i] = transform.resize(images[i],(self.resize_dim,self.resize_dim))
        images = np.transpose(images,(2,1,0))
        images = transform.rotate(images,-90)
        images = np.array([images])
        return images

    def act(self,x,Testing=False):
        if Testing:
            x = self.preprocess(x)
            Q = self.sess.run(self.logits,feed_dict={self.x:x})
            action = np.argmax(Q[0])
        else:
            if random.random() > self.epsilon:
                x = self.preprocess(x)
                Q = self.sess.run(self.logits,feed_dict={self.x:x})
                action = np.argmax(Q[0])
            else:
                action = random.randrange(0,self.n_actions)
        return action

    def replay(self,batch):
        losses = []
        for state, next_state, reward, action, done in batch:
            state = self.preprocess(state)
            next_state = self.preprocess(next_state)
            target = np.zeros(self.n_actions)
            if not done:
                next_reward = np.amax(self.sess.run(self.logits,feed_dict={self.x:next_state})[0])
                target[action] = reward + next_reward*self.gamma
            l,_ = self.sess.run([self.loss,self.optimizer],feed_dict={self.x:state, self.y:[target]})
            losses.append(l)
        return sum(losses)/len(losses)

    def save_model(self):
        self.saver.save(self.sess,getcwd()+'/model.ckpt')

    def load_model(self):
        self.saver.restore(self.sess,getcwd()+'/model.ckpt')

    def decrease_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon*self.epsilon_decay)

    def remember(self,state, next_state, reward, action, done):
        self.memory.append((state, next_state, reward, action, done))

    def sample_memory(self,size):
        return random.sample(self.memory,size)
EPISODES = 2000
env = gym.make('Breakout-v0')
n_actions = env.action_space.n
resize_dim = 84
history_length = 3
replay_memory_size = 190000
batch_size = 32
how_often = 25
losses = []
episode_losses = []
agent = DQN_Agent(resize_dim,n_actions,replay_memory_size,history_length)

with tf.device('/device:GPU:0'):
    for episode in range(EPISODES):
        state = env.reset()
        agent.history.append(state)
        state = agent.history
        episode_reward = 0
        while True:
            action = agent.act(state)
            next_state,reward,done,info = env.step(action)
            agent.history.append(next_state)
            next_state = agent.history
            episode_reward += reward
            agent.remember(state, next_state, reward, action, done)
            state = next_state
            if len(agent.memory) >= batch_size:
                batch = agent.sample_memory(batch_size)
                l = agent.replay(batch)
                print('average loss on batch:',l)
                losses.append(l)
            if done:
                print('episode: {}/{}, episode reward: {}, epsilon: {}'.format(episode+1,EPISODES,episode_reward,agent.epsilon))
                break
        agent.decrease_epsilon()
        if (episode+1)%how_often == 0:
            agent.save_model()
        episode_losses.append(sum(losses)/len(losses))
        plt.plot(range(episode+1),episode_losses)
        plt.ylabel('losses')
        plt.xlabel('episodes')
        plt.savefig(getcwd()+'/loss_plot.png')
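
For context, with history_length = 3 the three most recent frames end up as the channel dimension of the network input. A standalone check of what preprocess() produces, mirroring its logic with dummy frames in place of the real Breakout observations (assuming the standard 210x160 RGB Atari frames), gives:

# Standalone shape check mirroring preprocess(); dummy frames stand in for env observations.
import numpy as np
from collections import deque
from skimage import transform
from skimage.color import rgb2gray

resize_dim, history_length = 84, 3
history = deque(maxlen=history_length)
for _ in range(history_length):
    history.append(np.zeros((210, 160, 3), dtype=np.uint8))  # dummy 210x160 RGB frames

frames = [transform.resize(rgb2gray(f), (resize_dim, resize_dim)) for f in history]
stacked = transform.rotate(np.transpose(np.array(frames), (2, 1, 0)), -90)
batch = np.array([stacked])
print(batch.shape)  # (1, 84, 84, 3) -- matches the [None, 84, 84, 3] placeholder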
The problem is that, over about 1000 episodes of training, the loss went from roughly 0.4 to over 0.9. After that I tried training the algorithm for 2000 episodes and feeding 3 frames (instead of 1) into the network, but that didn't change anything. Here is a plot of the loss:
Also, when I try to test the network in the Breakout environment, the paddle doesn't even move. Can anyone tell me how to fix these problems?
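
Roughly, the way I test it looks like the following (a simplified sketch, not the exact script: it assumes the DQN_Agent class above is defined, that load_model() finds the checkpoint written by save_model(), and it passes a copy of the frame history so preprocess() does not overwrite the stored raw frames):

# Simplified evaluation sketch; episode counts and logging omitted.
env = gym.make('Breakout-v0')
agent = DQN_Agent(resize_dim=84, n_actions=env.action_space.n,
                  replay_memory_size=190000, history_length=3)
agent.load_model()

state = env.reset()
for _ in range(agent.history_length):   # pre-fill the frame history
    agent.history.append(state)

done = False
while not done:
    env.render()
    # Testing=True -> always take argmax of the network output, no exploration
    action = agent.act(list(agent.history), Testing=True)
    state, reward, done, info = env.step(action)
    agent.history.append(state)
env.close()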