I'm trying to solve a simple game with deep Q-learning (using a TensorFlow / TF-Agents DQN agent and a custom environment).
The game consists of a single player on a 10x10 grid. The player can move up/down/left/right or stay in place, and every action costs HP.
The player starts the game with a certain amount of HP. Food spawns at random positions during the game, and the player is supposed to learn to pick it up in order to survive. The food doesn't move; it just sits where it spawned.
The approach:
The player receives a reward of 0.1 for every turn it survives.
The state is represented as an array of tuples of size 4.
Each tuple represents one organism and is defined as (x, y, hp, type).
The first tuple is always the player; the remaining tuples are the prey (food).
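For example, with one food tuple included (input_layer_organism_count = 1 in the code below), the observation is a flat array of 8 numbers: the player's (x, y, hp, type) followed by the closest food's (x, y, hp, type). A made-up illustration:
# hypothetical observation: player at (3, 7) with 18 HP (type 1 = player),
# closest food at (4, 7) with 5 HP (type 0 = food)
observation = np.array([3, 7, 18, 1, 4, 7, 5, 0], dtype=np.double)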
I copied the code from the DQN tutorial provided by TensorFlow - link.
The problem:
The loss is very small at the start, and after a few iterations it drops to 0.0 and stays at 0 most of the time.
I think this is why the agent never learns a decent policy - the best result it achieves after 40,000 iterations is worse than simply standing still.
I haven't changed anything significant in the code that trains the agent; it's the same code given in the tutorial.
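For reference, the tutorial's training setup (which I'm using essentially unchanged) looks roughly like the sketch below; the hyperparameter values shown here are placeholders rather than the exact ones I ran with:
train_env = tf_py_environment.TFPyEnvironment(GameEnv())

# Q-network + DQN agent, as in the TF-Agents DQN tutorial
q_net = q_network.QNetwork(
    train_env.observation_spec(),
    train_env.action_spec(),
    fc_layer_params=(100,))
agent = dqn_agent.DqnAgent(
    train_env.time_step_spec(),
    train_env.action_spec(),
    q_network=q_net,
    optimizer=tf.compat.v1.train.AdamOptimizer(learning_rate=1e-3),
    td_errors_loss_fn=common.element_wise_squared_loss,
    train_step_counter=tf.Variable(0))
agent.initialize()

# Uniform replay buffer, filled one environment step at a time
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(
    data_spec=agent.collect_data_spec,
    batch_size=train_env.batch_size,
    max_length=100000)

def collect_step(environment, policy, buffer):
    time_step = environment.current_time_step()
    action_step = policy.action(time_step)
    next_time_step = environment.step(action_step.action)
    buffer.add_batch(trajectory.from_transition(time_step, action_step, next_time_step))

# Pre-fill the buffer with a random policy before training starts
random_policy = random_tf_policy.RandomTFPolicy(train_env.time_step_spec(), train_env.action_spec())
for _ in range(1000):
    collect_step(train_env, random_policy, replay_buffer)

dataset = replay_buffer.as_dataset(
    num_parallel_calls=3, sample_batch_size=64, num_steps=2).prefetch(3)
iterator = iter(dataset)

agent.train = common.function(agent.train)
for _ in range(40000):
    collect_step(train_env, agent.collect_policy, replay_buffer)
    experience, _ = next(iterator)
    train_loss = agent.train(experience).loss
    step = agent.train_step_counter.numpy()
    if step % 20 == 0:
        print('step = {0}: loss = {1}'.format(step, train_loss))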
Here is the code of the game environment:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import random
import collections
import abc
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from tf_agents.environments import py_environment
from tf_agents.environments import tf_environment
from tf_agents.environments import tf_py_environment
from tf_agents.environments import utils
from tf_agents.specs import array_spec
from tf_agents.environments import wrappers
from tf_agents.environments import suite_gym
from tf_agents.trajectories import time_step as ts
from tf_agents.agents.dqn import dqn_agent
from tf_agents.drivers import dynamic_step_driver
from tf_agents.eval import metric_utils
from tf_agents.metrics import tf_metrics
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.trajectories import trajectory
from tf_agents.utils import common
from copy import deepcopy
tf.compat.v1.enable_v2_behavior()
DISCOUNT_FACTOR = 0.9
board_size = 10
starting_count = 1
input_layer_organism_count = 1
max_moves = 500
GREEN_TYPE = 0
RED_PLAYER = 1
start_energy = 20
green_organisms_energy = 5
class Organism:
    id = -1
    x = y = -1
    energy = 0
    type = -1
    def __init__(self, x, y, energy, type):
        self.id = np.random.randint(-10000000, 10000000)
        self.x = x; self.y = y
        self.energy = energy
        self.type = type
    def to_array(self):
        attributes_array = [self.x, self.y, self.energy, self.type]
        return attributes_array
class GameEnv(py_environment.PyEnvironment):
    NO_ORGANISM = Organism(x=-1, y=-1, energy=-1, type=-1)
    number_of_attributes = 4  # Number of attributes / properties required to describe an organism
    move_number = 0
    total_expended_energy = 0.0
    player = NO_ORGANISM
    green_organisms = []
    def __generate_random_organism(self, organism_energy, organism_type):
        # randint is inclusive on both ends, so use board_size - 1 to stay on the 10x10 grid
        new_organism = Organism(x=random.randint(0, board_size - 1), y=random.randint(0, board_size - 1),
                                energy=organism_energy, type=organism_type)
        return new_organism
    def __generate_random_organisms(self, count, organism_energy, organism_list, organism_type):
        for _ in range(count):
            new_organism = self.__generate_random_organism(organism_energy, organism_type)
            organism_list.append(new_organism)
    def get_reward(self):
        if self.player.energy > 0:
            return 1.0
        return 0.0
    def get_current_state_as_array(self):
        state = self.player.to_array()
        def pick_closest_organisms(organisms, count, current_organism):
            organism_distance_map = []
            for other_organism in organisms:
                if other_organism.id != current_organism.id:
                    distance = self.get_distance(other_organism, current_organism)
                    organism_distance_map.append((distance, other_organism))
            sorted_distances = sorted(organism_distance_map, key=lambda x: x[0])
            closest_organisms = []
            while len(sorted_distances) < count:
                sorted_distances.append((0, self.NO_ORGANISM))
            for i in range(count):
                closest_organisms.append(sorted_distances[i][1])
            return closest_organisms
        closest_organisms = pick_closest_organisms(self.green_organisms, int(input_layer_organism_count), self.player)
        for organism in closest_organisms:
            state += organism.to_array()
        return np.asarray(state, dtype=np.double)
    def __init__(self):
        self._action_spec = array_spec.BoundedArraySpec(
            shape=(), dtype=np.int32, minimum=0, maximum=4, name='action')
        self._observation_spec = array_spec.BoundedArraySpec(
            shape=((input_layer_organism_count + 1) * self.number_of_attributes,), dtype=np.double, minimum=-1, name='observation')
        self.green_organisms.clear()
        self.player = self.__generate_random_organism(start_energy, RED_PLAYER)
        self._state = self.get_current_state_as_array()
        self._episode_ended = False
        self.move_number = 0
        self.total_expended_energy = 0.0
    def action_spec(self):
        return self._action_spec
    def observation_spec(self):
        return self._observation_spec
    def _reset(self):
        self.green_organisms.clear()
        self.player = self.__generate_random_organism(start_energy, RED_PLAYER)
        self._state = self.get_current_state_as_array()
        self._episode_ended = False
        self.move_number = 0
        self.total_expended_energy = 0.0
        return ts.restart(self._state)
    def is_game_over(self):
        if self.move_number >= max_moves:
            return True
        if self.player.energy <= 0:
            return True
        return False
    def get_distance(self, organism1, organism2):
        dx = abs(organism1.x - organism2.x)
        dy = abs(organism1.y - organism2.y)
        if dx > 0.5 * board_size:
            dx = board_size - dx
        if dy > 0.5 * board_size:
            dy = board_size - dy
        return np.sqrt(dx * dx + dy * dy)
    def move_organism(self, organism, move_action):
        # Moves are coded as follows: 0 - stay, 1 - left, 2 - right, 3 - up, 4 - down
        # All moves except stay cost 2 energy, stay costs 1 energy
        # Update energy
        new_energy = organism.energy
        if move_action == 0:
            new_energy -= 1
            self.total_expended_energy += 1
        else:
            new_energy -= 2
            self.total_expended_energy += 2
        # Update position
        new_x = organism.x
        new_y = organism.y
        if move_action == 1:
            new_x = (organism.x - 1) % board_size
        elif move_action == 2:
            new_x = (organism.x + 1) % board_size
        elif move_action == 3:
            new_y = (organism.y - 1) % board_size
        elif move_action == 4:
            new_y = (organism.y + 1) % board_size
        new_organism = deepcopy(organism)
        new_organism.x = new_x; new_organism.y = new_y
        new_organism.energy = new_energy
        return new_organism
    def _step(self, action):
        if self._episode_ended:
            # The last action ended the episode. Ignore the current action and start
            # a new episode.
            return self.reset()
        if self.is_game_over():
            self._episode_ended = True
            return ts.termination(self.get_current_state_as_array(), 0)
        else:
            number_of_new_green_organisms = np.floor(self.total_expended_energy / green_organisms_energy)
            self.total_expended_energy -= number_of_new_green_organisms * green_organisms_energy
            self.__generate_random_organisms(int(number_of_new_green_organisms), green_organisms_energy,
                                             self.green_organisms, GREEN_TYPE)
            self.move_number += 1
            # Move the player
            self.player = self.move_organism(self.player, action)
            # Check if the player should consume a green organism
            for i in range(len(self.green_organisms)):
                other_organism = self.green_organisms[i]
                if self.get_distance(other_organism, self.player) <= 1.01:
                    self.player.energy += other_organism.energy
                    del self.green_organisms[i]
                    break
            if self.is_game_over():
                self._episode_ended = True
                return ts.termination(self.get_current_state_as_array(), 0)
            return ts.transition(self.get_current_state_as_array(), self.get_reward(), discount=DISCOUNT_FACTOR)
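As a quick sanity check, the environment can be validated against its declared specs and stepped manually before training, along these lines:
environment = GameEnv()
# runs a few episodes with random actions and checks that observations,
# rewards and step types conform to the declared specs
utils.validate_py_environment(environment, episodes=5)
# step once manually to inspect the TimeStep contents
time_step = environment.reset()
print(time_step)
time_step = environment.step(np.int32(1))  # 1 = move left
print(time_step.reward, time_step.observation)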
Loss during training:
step = 5800: loss = 0.0
step = 5820: loss = 0.0
step = 5840: loss = 0.0
step = 5860: loss = 0.0
step = 5880: loss = 0.0
step = 5900: loss = 0.0
step = 5920: loss = 0.0
step = 5940: loss = 0.0
step = 5960: loss = 0.0
step = 5980: loss = 0.0
step = 6000: loss = 0.0