This code is supposed to train a DQN (Deep Q-Network) agent on the CartPole environment using the TF-Agents library, but the agent does not seem to train correctly. I am trying to write a minimal example using the driver module.
I can run the examples that ship with the TF-Agents library without problems.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tf_agents.environments import suite_gym
from tf_agents.environments import tf_py_environment
from tf_agents.networks import q_network
from tf_agents.policies import random_tf_policy
from tf_agents.agents.dqn import dqn_agent
from tf_agents.replay_buffers import tf_uniform_replay_buffer
from tf_agents.drivers import dynamic_step_driver
from tf_agents.metrics import tf_metrics
from tf_agents.eval import metric_utils
tf.compat.v1.enable_v2_behavior()
# parameters
env_name = 'CartPole-v0'
num_iterations = 20000
collect_steps_per_iteration = 1
initial_steps = 1000
replay_buffer_capacity = 100000
batch_size = 64
learning_rate = 0.001
fc_layer_params = (50, )
# load environments
py_train_env = suite_gym.load(env_name)
py_eval_env = suite_gym.load(env_name)
tf_train_env = tf_py_environment.TFPyEnvironment(py_train_env)
tf_eval_env = tf_py_environment.TFPyEnvironment(py_eval_env)
# create agent
q_net = q_network.QNetwork(tf_train_env.observation_spec(), tf_train_env.action_spec(), fc_layer_params=fc_layer_params)
optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=learning_rate)
tf_agent = dqn_agent.DqnAgent(tf_train_env.time_step_spec(), tf_train_env.action_spec(), q_network=q_net,
                              optimizer=optimizer)
tf_agent.initialize()
# replay buffer, policies and driver
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(data_spec=tf_agent.collect_data_spec,
                                                               batch_size=tf_train_env.batch_size,
                                                               max_length=replay_buffer_capacity)
random_policy = random_tf_policy.RandomTFPolicy(tf_train_env.time_step_spec(), tf_train_env.action_spec())
collect_policy = tf_agent.collect_policy
eval_policy = tf_agent.policy
init_driver = dynamic_step_driver.DynamicStepDriver(tf_train_env, random_policy, observers=[replay_buffer.add_batch],
                                                    num_steps=initial_steps)
collect_driver = dynamic_step_driver.DynamicStepDriver(tf_train_env, collect_policy, observers=[replay_buffer.add_batch],
                                                        num_steps=collect_steps_per_iteration)
# collect init data
init_driver.run()
ds = replay_buffer.as_dataset(num_parallel_calls=3, sample_batch_size=batch_size, num_steps=2).prefetch(3)
iterator = iter(ds)
# train agent
print('Train Agent(global steps=' + str(num_iterations*collect_steps_per_iteration) + '):')
tf_train_env.reset()
for i in range(num_iterations):
    collect_driver.run()
    experience, _ = next(iterator)
    train_loss = tf_agent.train(experience)
    # evaluate every 100 steps
    if ((i+1) * collect_steps_per_iteration) % 100 == 0:
        metric = [tf_metrics.AverageReturnMetric()]
        result = metric_utils.eager_compute(metric, tf_eval_env, eval_policy, num_episodes=5)
        print('step = {0}: loss = {1}: AR = {2}'.format((i+1) * collect_steps_per_iteration, train_loss.loss,
                                                        result['AverageReturn'].numpy()))
The code runs, but the agent cannot play the game after training. I also expected the average return to increase over time, yet it stays roughly constant.
Answer 0 (score: 0)
:) Give this a try.
# replay buffer, policies and driver
replay_buffer = tf_uniform_replay_buffer.TFUniformReplayBuffer(...)
replay_buffer_observer = replay_buffer.add_batch

from tf_agents.metrics import tf_metrics
train_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(),
    tf_metrics.AverageEpisodeLengthMetric(),
]

dynamic_step_driver.DynamicStepDriver(..., observers=[replay_buffer_observer] + train_metrics, ...)
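Wired into the code from the question, the suggestion would look roughly like the minimal sketch below. It assumes the names from the question (tf_train_env, collect_policy, replay_buffer, iterator, tf_agent and the hyperparameters) are already defined; the logging inside the training loop is my own illustration of how to read the metrics, not part of the original answer.

# Attach the training metrics as extra observers so the driver
# updates them on every environment step it collects.
replay_buffer_observer = replay_buffer.add_batch
train_metrics = [
    tf_metrics.NumberOfEpisodes(),
    tf_metrics.EnvironmentSteps(),
    tf_metrics.AverageReturnMetric(),
    tf_metrics.AverageEpisodeLengthMetric(),
]

collect_driver = dynamic_step_driver.DynamicStepDriver(
    tf_train_env,
    collect_policy,
    observers=[replay_buffer_observer] + train_metrics,
    num_steps=collect_steps_per_iteration)

# During training the metrics can be read directly from the list,
# e.g. the average return collected by the training policy:
for i in range(num_iterations):
    collect_driver.run()
    experience, _ = next(iterator)
    train_loss = tf_agent.train(experience)
    if (i + 1) % 100 == 0:
        print('step = {0}: loss = {1}: AR = {2}'.format(
            i + 1, train_loss.loss,
            train_metrics[2].result().numpy()))  # AverageReturnMetric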