I am a bit confused at the moment. I implemented an Actor-Critic network and, depending on the setup, it either starts to converge a little but produces values that are far off, or it produces nearly the same loss value over and over again while producing roughly correct values right from the start. I really don't see how that is possible. Here is my current model (imports included for completeness), which produces values but does not converge:
import random
import warnings
from collections import deque
from copy import copy
from datetime import timedelta
import numpy as np
import tensorflow as tf
from keras.layers import Input, Dense, Add, BatchNormalization, LeakyReLU
from keras.models import Model
from keras.optimizers import Adam

def create_actor_model(self):
    state_input = Input(shape=self.observation_space_shape)
    h1 = Dense(18, activation='linear')(state_input)
    h1l = LeakyReLU(alpha=0.01)(h1)
    h3 = Dense(18, activation='tanh')(h1l)
    h3n = BatchNormalization()(h3)
    output = Dense(self.action_space_shape[0], activation='tanh')(h3n)
    model = Model(inputs=state_input, outputs=output)
    adam = Adam(lr=self.learning_rate)
    model.compile(loss="mse", optimizer=adam)
    return state_input, model
def create_critic_model(self):
    state_input = Input(shape=self.observation_space_shape)
    state_h1 = Dense(18, activation='relu')(state_input)
    state_h2 = Dense(36)(state_h1)

    action_input = Input(shape=self.action_space_shape)
    action_h1 = Dense(36)(action_input)

    merged = Add()([state_h2, action_h1])
    l_h1 = LeakyReLU(alpha=0.01)(merged)
    merged_h1 = Dense(18, activation='tanh')(l_h1)
    h1n = BatchNormalization()(merged_h1)
    output = Dense(1, activation='tanh')(h1n)
    model = Model(inputs=[state_input, action_input], outputs=output)
    adam = Adam(lr=self.learning_rate)
    model.compile(loss="mse", optimizer=adam, metrics=['mae', 'mse', 'msle'])
    return state_input, action_input, model
def _train_actor_batch(self, batch_size, s_batch, a_batch, r_batch, s2_batch):
    # Gradient of the critic's Q-value w.r.t. the actions currently proposed
    # by the actor, fed back into the actor's custom optimizer.
    predicted_action = self.actor_model.predict_on_batch(s_batch)
    grads = self.sess.run(self.critic_grads, feed_dict={
        self.critic_state_input: s_batch,
        self.critic_action_input: predicted_action
    })
    self.sess.run(self.optimize, feed_dict={
        self.actor_state_input: s_batch,
        self.actor_critic_grad: grads[0]
    })
def _train_critic_batch(self, batch_size, s_batch, a_batch, r_batch, s2_batch):
    # One-step TD targets: r + gamma * Q_target(s', mu_target(s'))
    target_action = self.target_actor_model.predict_on_batch(s2_batch)
    future_reward = self.target_critic_model.predict_on_batch([s2_batch, target_action])
    rewards = []
    for k in range(batch_size):
        this_future_reward = future_reward[k] if batch_size > 1 else future_reward
        rewards.append(r_batch[k] + self.gamma * this_future_reward)
    return self.critic_model.train_on_batch([s_batch, a_batch], np.reshape(rewards, batch_size))
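(Just to spell out what that loop is intended to compute: each target is the one-step TD target r + gamma * Q_target(s', mu_target(s')). The small standalone helper below, td_targets, is only an illustration of that formula in NumPy, not part of my class:)

import numpy as np

def td_targets(r_batch, future_reward, gamma):
    """One-step TD targets: r + gamma * Q_target(s', mu_target(s')).

    r_batch has shape (batch_size,); future_reward has shape (batch_size, 1),
    as returned by predict_on_batch on the target critic.
    """
    return r_batch.reshape(-1, 1) + gamma * future_reward.reshape(-1, 1)

# Example: td_targets(np.array([0.27, 0.10]), np.array([[0.5], [0.3]]), 0.95)
# -> [[0.745], [0.385]]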
def replay(self, batch_size):
    memory_length = len(self.memory)
    if memory_length < batch_size:
        samples = random.sample(self.memory, memory_length)
    else:
        samples = random.sample(self.memory, batch_size)
    s_batch = np.array([cur_state[0] for cur_state, _, _, _ in samples])
    a_batch = np.array([float(action[0]) for _, action, _, _ in samples])
    r_batch = np.array([reward[0] for _, _, reward, _ in samples])
    s2_batch = np.array([new_state[0] for _, _, _, new_state in samples])
    critic_loss = self._train_critic_batch(len(s_batch), s_batch, a_batch, r_batch, s2_batch)
    self._train_actor_batch(len(s_batch), s_batch, a_batch, r_batch, s2_batch)
    self.update_target()
    return critic_loss
def _update_actor_target(self):
    # Soft update: theta_target <- tau * theta + (1 - tau) * theta_target
    actor_model_weights = self.actor_model.get_weights()
    actor_target_weights = self.target_actor_model.get_weights()
    for i in range(len(actor_target_weights)):
        actor_target_weights[i] = actor_model_weights[i] * self.tau + actor_target_weights[i] * (1 - self.tau)
    self.target_actor_model.set_weights(actor_target_weights)

def _update_critic_target(self):
    critic_model_weights = self.critic_model.get_weights()
    critic_target_weights = self.target_critic_model.get_weights()
    for i in range(len(critic_target_weights)):
        critic_target_weights[i] = critic_model_weights[i] * self.tau + critic_target_weights[i] * (1 - self.tau)
    self.target_critic_model.set_weights(critic_target_weights)

def update_target(self):
    self._update_actor_target()
    self._update_critic_target()
def __init__(self):
    self.memory = deque(maxlen=2000)

    # Actor, its target network, and the custom policy-gradient update.
    self.actor_state_input, self.actor_model = self.create_actor_model()
    _, self.target_actor_model = self.create_actor_model()
    self.actor_critic_grad = tf.placeholder(tf.float32, [None, self.action_space_shape[0]])
    actor_model_weights = self.actor_model.trainable_weights
    self.actor_grads = tf.gradients(self.actor_model.output,
                                    actor_model_weights, -self.actor_critic_grad)
    grads = zip(self.actor_grads, actor_model_weights)
    self.optimize = tf.train.AdamOptimizer(self.learning_rate).apply_gradients(grads)

    # Critic, its target network, and dQ/da for the actor update.
    self.critic_state_input, self.critic_action_input, \
        self.critic_model = self.create_critic_model()
    _, _, self.target_critic_model = self.create_critic_model()
    self.critic_grads = tf.gradients(self.critic_model.output,
                                     self.critic_action_input)

    self.sess.run(tf.global_variables_initializer())
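The methods above also rely on a handful of attributes that are set near the top of __init__ but left out of the snippet; roughly they look like this (the concrete values below are only placeholders, not necessarily the ones I use):

    # Placeholder values only; set before the models are built.
    self.observation_space_shape = (9,)  # length of the state vector (see the sample below)
    self.action_space_shape = (1,)       # one continuous action, squashed into [-1, 1] by tanh
    self.learning_rate = 0.001
    self.gamma = 0.95                    # discount factor
    self.tau = 0.125                     # soft target-update rate
    # (with: from keras import backend as K)
    self.sess = tf.Session()
    K.set_session(self.sess)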
I then train with the following method, which is called once per epoch (the memory that gets cleared at the end is the experience-replay memory):
def train(self, states, epoch, env, is_new_epoch):
    train_size = int(len(states) * 0.70)
    train = dict(list(states.items())[0:train_size])
    test = dict(list(states.items())[train_size:len(states)])
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        working_states = copy(train)
        critic_eval = list()
        rewards = dict()

        # Pre-compute the reward for every (day -> next day) transition.
        for last_day, (last_state_vec, _, last_action) in working_states.items():
            this_day = last_day + timedelta(days=1)
            if this_day in working_states:
                (new_state_vec, _, _) = working_states.get(this_day)
                rewards[last_day] = env.get_reward_by_states(last_state_vec, new_state_vec)

        # Store each transition and replay a batch right away.
        amt = len(working_states)
        i = 0
        for last_day, (last_state_vec, _, last_action) in working_states.items():
            i += 1
            this_day = last_day + timedelta(days=1)
            if this_day in working_states:
                (new_state_vec, _, _) = working_states.get(this_day)
                reward = np.reshape(rewards[last_day], [1, ])
                self.remember(last_state_vec, [last_action], reward, new_state_vec)
                new_eval = self.replay(env.batch_size)
                critic_eval.append(new_eval)

    self.memory.clear()
A sample from the replay memory:
state:      [8 79 48246 53607 29 34 37 Decimal('1.0000000000') 6]
action:     0.85
reward:     0.2703302
next state: [9 79 48074 57869 27 28 32 Decimal('1.0000000000') 0]