Deep Q-Network loss keeps increasing

Time: 2018-09-08 14:59:52

Tags: python tensorflow deep-learning reinforcement-learning q-learning

I am trying to build a DQN for Atari games in TensorFlow. Here is my code:

import tensorflow as tf
import gym
import numpy as np
import matplotlib.pyplot as plt
import random
from collections import deque
from skimage import transform
from skimage import io
from skimage.color import rgb2gray
from os import getcwd

class DQN_Agent:
    # the Q-network: conv layers followed by dense layers; the output layer applies a softmax over the actions
    def DQN(self,x):
        layer1 = tf.layers.conv2d(x,32,5,padding='same',activation=tf.nn.relu)
        layer2 = tf.layers.conv2d(layer1,32,5,padding='same',activation=tf.nn.relu)
        layer3 = tf.layers.flatten(layer2)
        layer4 = tf.layers.dense(layer3,24,tf.nn.relu)
        layer5 = tf.layers.dense(layer4,24,tf.nn.relu)
        layer6 = tf.layers.dense(layer5,self.n_actions,tf.nn.softmax)

        return layer6

    def __init__(self,resize_dim,n_actions,replay_memory_size,history_length):
        self.resize_dim = resize_dim
        self.history_length = history_length
        self.history = deque(maxlen=self.history_length)
        self.n_actions = n_actions
        self.memory = deque(maxlen=replay_memory_size)
        self.learning_rate = 0.001
        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_decay = 0.999
        self.epsilon_min = 0.1

        self.x = tf.placeholder(tf.float32,[None,self.resize_dim,self.resize_dim,self.history_length])
        self.y = tf.placeholder(tf.float32,[None,self.n_actions])
        self.logits = self.DQN(self.x)
        self.loss = tf.losses.softmax_cross_entropy(self.y,self.logits)
        self.optimizer = tf.train.AdamOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
        self.sess = tf.Session()
        self.saver = tf.train.Saver()
        self.init = tf.global_variables_initializer()

        self.sess.run(self.init)

    # grayscale and resize each frame, then stack the frames into shape (1, resize_dim, resize_dim, history_length)
    def preprocess(self,images):
        for i in range(len(images)):
            images[i] = rgb2gray(images[i])
            images[i] = transform.resize(images[i],(self.resize_dim,self.resize_dim))
        images = np.transpose(images,(2,1,0))
        images = transform.rotate(images,-90)
        images = np.array([images])

        return images

    def act(self,x,Testing=False):
        if Testing:
            x = self.preprocess(x)
            Q = self.sess.run(self.logits,feed_dict={self.x:x})
            action = np.argmax(Q[0])

        else:
            if random.random() > self.epsilon:
                x = self.preprocess(x)
                Q = self.sess.run(self.logits,feed_dict={self.x:x})
                action = np.argmax(Q[0])
            else:
                action = random.randrange(0,self.n_actions)

        return action

    # one gradient step per sampled transition; the target vector is zero except at the taken action
    def replay(self,batch):
        losses = []
        for state, next_state, reward, action, done in batch:
            state = self.preprocess(state)
            next_state = self.preprocess(next_state)

            target = np.zeros(self.n_actions)

            if not done:
                next_reward = np.amax(self.sess.run(self.logits,feed_dict={self.x:next_state})[0])
                target[action] = reward + next_reward*self.gamma

            l,_ = self.sess.run([self.loss,self.optimizer],feed_dict={self.x:state, self.y:[target]})
            losses.append(l)
        return sum(losses)/len(losses)

    def save_model(self):
        self.saver.save(self.sess,getcwd()+'/model.ckpt')

    def load_model(self):
        self.saver.restore(self.sess,getcwd()+'/model.ckpt')

    def decrease_epsilon(self):
        self.epsilon = max(self.epsilon_min, self.epsilon*self.epsilon_decay)

    def remember(self,state, next_state, reward, action, done):
        self.memory.append((state, next_state, reward, action, done))

    def sample_memory(self,size):
        return random.sample(self.memory,size)

EPISODES = 2000
env = gym.make('Breakout-v0')
n_actions = env.action_space.n
resize_dim = 84
history_length = 3
replay_memory_size = 190000
batch_size = 32
how_often = 25
losses = []
episode_losses = []

agent = DQN_Agent(resize_dim,n_actions,replay_memory_size,history_length)

with tf.device('/device:GPU:0'):
    for episode in range(EPISODES):
        state = env.reset()
        agent.history.append(state)
        state = agent.history
        episode_reward = 0
        while True:
            action = agent.act(state)
            next_state,reward,done,info = env.step(action)
            agent.history.append(next_state)
            next_state = agent.history
            episode_reward += reward
            agent.remember(state, next_state, reward, action, done)
            state = next_state

            if len(agent.memory) >= batch_size:
                batch = agent.sample_memory(batch_size)
                l = agent.replay(batch)
                print('average loss on batch:',l)
                losses.append(l)

            if done:
                print('episode: {}/{}, episode reward: {}, epsilon: {}'.format(episode+1,EPISODES,episode_reward,agent.epsilon))
                break
        agent.decrease_epsilon()

        if (episode+1)%how_often == 0:
            agent.save_model()

        episode_losses.append(sum(losses)/len(losses))

        plt.plot(range(episode+1),episode_losses)
        plt.ylabel('losses')
        plt.xlabel('episodes')
        plt.savefig(getcwd()+'/loss_plot.png')
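
To make the training step concrete: for each sampled transition, replay() builds a target vector that is zero everywhere except at the taken action, and passes it together with the network output to tf.losses.softmax_cross_entropy. A small numeric illustration (the numbers are made up and only mirror the code above; this is not extra training code):

import numpy as np

# purely illustrative numbers -- this only mirrors what replay() builds
n_actions = 4                               # Breakout-v0 exposes 4 discrete actions
reward, gamma = 1.0, 0.99
q_next = np.array([0.1, 0.2, 0.6, 0.1])     # hypothetical network output for next_state
action = 2                                  # the action stored with the transition

target = np.zeros(n_actions)
target[action] = reward + gamma * np.amax(q_next)   # Bellman target for the taken action only
print(target)                               # [0.    0.    1.594 0.   ]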

The problem is that while training the network for 1000 episodes, the loss went from about 0.4 to over 0.9. After that, I tried training the algorithm for 2000 episodes and feeding 3 frames (instead of 1) into the network, but that did not change anything. This is the plot of the loss:

Also, when I try to test the network in the Breakout environment, the paddle does not even move. Can someone tell me how to fix these problems?
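
For reference, a minimal sketch of what such a test run looks like, assuming the saved model is loaded and act() is called with Testing=True so the greedy action is always taken (the exact test script is not part of the code above):

# rough sketch of the evaluation loop -- not part of the training script above
agent.load_model()
agent.history.clear()
state = env.reset()
for _ in range(agent.history_length):       # fill the 3-frame history with the first frame
    agent.history.append(state)
done = False
test_reward = 0
while not done:
    env.render()
    action = agent.act(list(agent.history), Testing=True)   # always the greedy action
    next_state, reward, done, info = env.step(action)
    agent.history.append(next_state)
    test_reward += reward
print('test episode reward:', test_reward)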

0 Answers:

There are no answers yet.