Loss is 0 when training a DQN

Time: 2020-07-31 13:25:59

Tags: python tensorflow reinforcement-learning loss-function q-learning

I am trying to solve a simple game with deep Q-learning (implemented with a TensorFlow DQN agent and a custom environment).
The game consists of a single player on a 10x10 grid. The player can move up/down/left/right or stay in place, and every move costs HP. The player starts the game with a certain amount of HP. During the game, food spawns at random positions, and the player is supposed to learn to pick up the food and survive. The food does not move; it just stays where it spawned.
Approach:
The player receives a reward of 0.1 for every turn it survives.
The state is represented as an array of tuples of size 4.
Each tuple represents one organism and is defined as follows: (x, y, hp, type). The first tuple is always the player; the remaining tuples are the prey (food).
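For example, with one player and one food item, the observation fed to the network is the two 4-tuples flattened into a single vector of length 8 (the concrete numbers below are just an illustration):

import numpy as np

# (x, y, hp, type) for the player followed by the closest food item,
# flattened into the 8-element observation vector the environment returns
player = (3, 7, 20, 1)   # type 1 = RED_PLAYER
food   = (4, 7, 5, 0)    # type 0 = GREEN_TYPE
observation = np.asarray(player + food, dtype=np.double)
print(observation)       # [ 3.  7. 20.  1.  4.  7.  5.  0.]
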
I copied the code from the DQN tutorial provided by TensorFlow - link
Problem:
The network loss is very small at the start, and after a few iterations it drops to 0.0 and stays there most of the time. I think this is why the agent cannot learn any decent policy - the best result the agent achieves after 40,000 iterations is worse than simply standing still.
I did not change anything important in the code that trains the agent; I just used the same code given in the tutorial.
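For reference, this is roughly what that training setup looks like when pointed at my environment. I am not pasting my exact copy here, so treat the hyperparameters and the small helper below as a sketch of the tutorial code rather than my literal script (GameEnv is the environment defined further down, and the imports are the same as in the environment code):

train_env = tf_py_environment.TFPyEnvironment(GameEnv())

q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=(100,))

agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=tf.Variable(0))
agent.initialize()

replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=100000)

def collect_step(environment, policy, buffer):
  # Take one step with the given policy and store the transition.
  time_step = environment.current_time_step()
  action_step = policy.action(time_step)
  next_time_step = environment.step(action_step.action)
  buffer.add_batch(trajectory.from_transition(time_step, action_step, next_time_step))

# Warm up the replay buffer with a random policy before training starts.
random_policy = random_tf_policy.RandomTFPolicy(
    train_env.time_step_spec(), train_env.action_spec())
for _ in range(1000):
  collect_step(train_env, random_policy, replay_buffer)

dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=64, num_steps=2).prefetch(3)
iterator = iter(dataset)

agent.train = common.function(agent.train)
for _ in range(40000):
  collect_step(train_env, agent.collect_policy, replay_buffer)
  experience, _ = next(iterator)
  train_loss = agent.train(experience).loss
  step = agent.train_step_counter.numpy()
  if step % 20 == 0:
    print('step = {0}: loss = {1}'.format(step, train_loss))
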
Here is the code for the game environment:

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import random
import collections
import abc
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from copy import deepcopy

tf.compat.v1.enable_v2_behavior()

DISCOUNT_FACTOR = 0.9

board_size = 10
starting_count = 1
input_layer_organism_count = 1
max_moves = 500

GREEN_TYPE = 0
RED_PLAYER = 1

start_energy = 20
green_organisms_energy = 5
class Organism:
  id = -1
  x = y = -1
  energy = 0
  type = -1
  def __init__(self,x,y,energy,type):
    self.id = np.random.randint(-10000000,10000000)
    self.x = x; self.y = y
    self.energy = energy
    self.type = type

  def to_array(self):
    attributes_array =  [self.x,self.y,self.energy,self.type]
    return attributes_array
class GameEnv(py_environment.PyEnvironment):
  NO_ORGANISM = Organism(x=-1, y=-1, energy=-1, type=-1)
  number_of_attributes = 4 # Number of attributes / properties required to describe an organism
  move_number = 0
  total_expended_energy = 0.0
  
  player = NO_ORGANISM
  green_organisms = []
  
  def __generate_random_organism(self, organism_energy, organism_type):
    new_organism = Organism(x=random.randint(0,board_size), y=random.randint(0,board_size), 
        energy=organism_energy, type=organism_type)
    return new_organism
  
  def __generate_random_organisms(self, count, organism_energy, organism_list, organism_type):
    for _ in range(count):
        new_organism = self.__generate_random_organism(organism_energy, organism_type)
        organism_list.append(new_organism)

  def get_reward(self):
    if self.player.energy > 0:
      return 1.0
    return 0.0

  def get_current_state_as_array(self):
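    # Observation: the player's (x, y, energy, type) followed by the attributes of the
    # closest food items, padded with NO_ORGANISM when fewer than
    # input_layer_organism_count food items exist.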
    state = self.player.to_array()

    def pick_closest_organisms(organisms, count, current_organism):
      organism_distance_map = []
      
      for other_organism in organisms:
        if other_organism.id != current_organism.id:
          distance = self.get_distance(other_organism, current_organism)
          organism_distance_map.append((distance,other_organism))
          
      sorted_distances = sorted(organism_distance_map, key=lambda x: x[0])
      
      closest_organisms = []
      while len(sorted_distances) < count:
        sorted_distances.append((0, self.NO_ORGANISM))
      for i in range(count):
        closest_organisms.append(sorted_distances[i][1])
        
      return closest_organisms

    closest_organisms = pick_closest_organisms(self.green_organisms,int(input_layer_organism_count),self.player)
    for organism in closest_organisms:
      state += organism.to_array()
    return np.asarray(state,dtype=np.double)

  def __init__(self):
    self._action_spec = array_spec.BoundedArraySpec(
        shape=(), dtype=np.int32, minimum=0, maximum=4, name='action')
    self._observation_spec = array_spec.BoundedArraySpec(
        shape=((input_layer_organism_count+1)*self.number_of_attributes,), dtype=np.double, minimum=-1, name='observation')

    self.green_organisms.clear()    
    self.player = self.__generate_random_organism(start_energy, RED_PLAYER)

    self._state = self.get_current_state_as_array()
    self._episode_ended = False
    self.move_number = 0
    self.total_expended_energy = 0.0

  def action_spec(self):
    return self._action_spec

  def observation_spec(self):
    return self._observation_spec

  def _reset(self):
    self.green_organisms.clear()
    self.player = self.__generate_random_organism(start_energy, RED_PLAYER)
    
    self._state = self.get_current_state_as_array()
    self._episode_ended = False
    self.move_number = 0
    self.total_expended_energy = 0.0
    return ts.restart(self._state)

  def is_game_over(self):
    if self.move_number >= max_moves:
        return True
    if self.player.energy <= 0:
        return True
    return False

  def get_distance(self, organism1, organism2):
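    # Wrap-around (toroidal) distance: the board edges are connected, which matches
    # the modulo arithmetic used in move_organism.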
    dx = abs(organism1.x - organism2.x)
    dy = abs(organism1.y - organism2.y)
    
    if dx > 0.5*board_size:
      dx = board_size - dx
    if dy > 0.5*board_size:
      dy = board_size - dy
      
    return np.sqrt(dx*dx + dy*dy)

  def move_organism(self, organism, move_action):
    # Moves are coded as follows: 0 - stay, 1 - left, 2 - right, 3 - up, 4 - down
    # All moves except stay cost 2 energy, stay costs 1 energy
    
    #Update energy
    new_energy = organism.energy
    if move_action == 0:
      new_energy -= 1
      self.total_expended_energy += 1
    else:
      new_energy -= 2
      self.total_expended_energy += 2
      
    #Update position
    new_x = organism.x
    new_y = organism.y
    if move_action == 1:
      new_x = (organism.x - 1) % board_size
    elif move_action == 2:
      new_x = (organism.x + 1) % board_size
    elif move_action == 3:
      new_y = (organism.y - 1) % board_size
    elif move_action == 4:
      new_y = (organism.y + 1) % board_size
    
    new_organism = deepcopy(organism)
    new_organism.x = new_x; new_organism.y = new_y
    new_organism.energy = new_energy
    return new_organism    

  def _step(self, action):
    if self._episode_ended:
      # The last action ended the episode. Ignore the current action and start
      # a new episode.
      return self.reset()
    
    if self.is_game_over():
      self.episode_ended = True
      return ts.termination(self.get_current_state_as_array(),0)
    else:
      number_of_new_green_organisms = np.floor(self.total_expended_energy/green_organisms_energy)
      self.total_expended_energy -= number_of_new_green_organisms * green_organisms_energy
      self.__generate_random_organisms(int(number_of_new_green_organisms), green_organisms_energy,
                                    self.green_organisms, GREEN_TYPE)
      
      self.move_number += 1
            
      # Move the player
      self.player = self.move_organism(self.player,action)
      
      # Check if the player should consume a green organism
      for i in range(len(self.green_organisms)):
        other_organism = self.green_organisms[i]
        if self.get_distance(other_organism, self.player) <= 1.01:
          self.player.energy += other_organism.energy
          del self.green_organisms[i]
          break
      
    if self.is_game_over():
      self.episode_ended = True
      return ts.termination(self.get_current_state_as_array(),0)
    return ts.transition(self.get_current_state_as_array(), self.get_reward(), discount = DISCOUNT_FACTOR)
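
To rule out a plain interface problem, the environment can be sanity-checked with the validate_py_environment helper that is already imported above (the episode count of 5 is arbitrary):

env = GameEnv()
# Basic spec / return-type check provided by tf_agents
utils.validate_py_environment(env, episodes=5)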

Loss during training:

step = 5800: loss = 0.0
step = 5820: loss = 0.0
step = 5840: loss = 0.0
step = 5860: loss = 0.0
step = 5880: loss = 0.0
step = 5900: loss = 0.0
step = 5920: loss = 0.0
step = 5940: loss = 0.0
step = 5960: loss = 0.0
step = 5980: loss = 0.0
step = 6000: loss = 0.0

0 Answers:

There are no answers.