I am trying to create a reinforcement learning agent that can buy, sell or hold stock positions. The problem I am running into is that even after more than 2000 episodes, the agent still does not learn when to buy, sell or hold. Here is an image from episode 2100 that shows what I mean: unless the action is random, the agent takes no action at all.

The agent learns using replay memory, and I have double- and triple-checked that there are no errors there. Here is the code for the agent:

import numpy as np
import tensorflow as tf
import random
from collections import deque
from .agent import Agent
class Agent(Agent):
    def __init__(self, state_size=7, window_size=1, action_size=3,
                 batch_size=32, gamma=.95, epsilon=.95, epsilon_decay=.95, epsilon_min=.01,
                 learning_rate=.001, is_eval=False, model_name="", stock_name="", episode=1):
        """
        state_size: Size of the state coming from the environment
        action_size: How many decisions the algo will make in the end
        gamma: Decay rate to discount future reward
        epsilon: Rate of randomly decided action
        epsilon_decay: Rate of decrease in epsilon
        epsilon_min: The lowest epsilon can get (limit to the randomness)
        learning_rate: Progress of neural net in each iteration
        episodes: How many times data will be run through
        """
        self.state_size = state_size
        self.window_size = window_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.learning_rate = learning_rate
        self.is_eval = is_eval
        self.model_name = model_name
        self.stock_name = stock_name
        self.q_values = []
        self.layers = [150, 150, 150]

        tf.reset_default_graph()
        self.sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
        self.memory = deque()

        if self.is_eval:
            model_name = stock_name + "-" + str(episode)
            self._model_init()
            # "models/{}/{}/{}".format(stock_name, model_name, model_name + "-" + str(episode) + ".meta")
            self.saver = tf.train.Saver()
            self.saver.restore(self.sess, tf.train.latest_checkpoint("models/{}/{}".format(stock_name, model_name)))
            # self.graph = tf.get_default_graph()
            # names = [tensor.name for tensor in tf.get_default_graph().as_graph_def().node]
            # self.X_input = self.graph.get_tensor_by_name("Inputs/Inputs:0")
            # self.logits = self.graph.get_tensor_by_name("Output/Add:0")
        else:
            self._model_init()
            self.sess.run(self.init)
            self.saver = tf.train.Saver()
            path = "models/{}/6".format(self.stock_name)
            self.writer = tf.summary.FileWriter(path)
            self.writer.add_graph(self.sess.graph)

    def _model_init(self):
        """
        Init tensorflow graph vars
        """
        # (1, 10, 9)
        with tf.device("/device:GPU:0"):
            with tf.name_scope("Inputs"):
                self.X_input = tf.placeholder(tf.float32, [None, self.state_size], name="Inputs")
                self.Y_input = tf.placeholder(tf.float32, [None, self.action_size], name="Actions")
                self.rewards = tf.placeholder(tf.float32, [None, ], name="Rewards")

            # self.lstm_cells = [tf.contrib.rnn.GRUCell(num_units=layer)
            #                    for layer in self.layers]
            # lstm_cell = tf.contrib.rnn.LSTMCell(num_units=n_neurons, use_peepholes=True)
            # gru_cell = tf.contrib.rnn.GRUCell(num_units=n_neurons)
            # self.multi_cell = tf.contrib.rnn.MultiRNNCell(self.lstm_cells)
            # self.outputs, self.states = tf.nn.dynamic_rnn(self.multi_cell, self.X_input, dtype=tf.float32)
            # self.top_layer_h_state = self.states[-1]
            # with tf.name_scope("Output"):
            #     self.out_weights = tf.Variable(tf.truncated_normal([self.layers[-1], self.action_size]))
            #     self.out_bias = tf.Variable(tf.zeros([self.action_size]))
            #     self.logits = tf.add(tf.matmul(self.top_layer_h_state, self.out_weights), self.out_bias)

            fc1 = tf.contrib.layers.fully_connected(self.X_input, 512, activation_fn=tf.nn.relu)
            fc2 = tf.contrib.layers.fully_connected(fc1, 512, activation_fn=tf.nn.relu)
            fc3 = tf.contrib.layers.fully_connected(fc2, 512, activation_fn=tf.nn.relu)
            fc4 = tf.contrib.layers.fully_connected(fc3, 512, activation_fn=tf.nn.relu)
            self.logits = tf.contrib.layers.fully_connected(fc4, self.action_size, activation_fn=None)

            with tf.name_scope("Cross_Entropy"):
                self.loss_op = tf.losses.mean_squared_error(self.Y_input, self.logits)
                self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate)
                self.train_op = self.optimizer.minimize(self.loss_op)

            # self.correct = tf.nn.in_top_k(self.logits, self.Y_input, 1)
            # self.accuracy = tf.reduce_mean(tf.cast(self., tf.float32))

            tf.summary.scalar("Reward", tf.reduce_mean(self.rewards))
            tf.summary.scalar("MSE", self.loss_op)
            # Merge all of the summaries
            self.summ = tf.summary.merge_all()
            self.init = tf.global_variables_initializer()

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon and not self.is_eval:
            prediction = random.randrange(self.action_size)
            if prediction == 1 or prediction == 2:
                print("Random")
            return prediction
        act_values = self.sess.run(self.logits, feed_dict={self.X_input: state.reshape((1, self.state_size))})
        if np.argmax(act_values[0]) == 1 or np.argmax(act_values[0]) == 2:
            pass
        return np.argmax(act_values[0])

    def replay(self, time, episode):
        print("Replaying")
        mini_batch = []
        l = len(self.memory)
        for i in range(l - self.batch_size + 1, l):
            mini_batch.append(self.memory[i])

        mean_reward = []
        x = np.zeros((self.batch_size, self.state_size))
        y = np.zeros((self.batch_size, self.action_size))
        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            target = reward
            if not done:
                self.target = reward + self.gamma * np.amax(self.sess.run(self.logits, feed_dict={self.X_input: next_state.reshape((1, self.state_size))})[0])
            current_q = self.sess.run(self.logits, feed_dict={self.X_input: state.reshape((1, self.state_size))})
            current_q[0][action] = self.target
            x[i] = state
            y[i] = current_q.reshape((self.action_size))
            mean_reward.append(self.target)

        # target_f = np.array(target_f).reshape(self.batch_size - 1, self.action_size)
        # target_state = np.array(target_state).reshape(self.batch_size - 1, self.window_size, self.state_size)
        _, c, s = self.sess.run([self.train_op, self.loss_op, self.summ],
                                feed_dict={self.X_input: x, self.Y_input: y, self.rewards: mean_reward})  # Add self.summ into the sess.run for tensorboard
        self.writer.add_summary(s, global_step=(episode + 1) / (time + 1))

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay
The replay function runs as soon as the replay memory is larger than the batch size. The code may look a bit messy because I have been trying to figure this out for several days. Here is a TensorBoard screenshot of the MSE: as you can see, by episode 200 the MSE drops to zero, or almost zero. I am stumped and have no idea what is going on, so please help me work this out. I have posted the code here so you can see the whole thing, including the train and eval files. The main agent I have been working on is LSTM.py in the agent folder. Thanks!
Answer 0 (score: 1)
As mentioned in the comments on the question, this looks like a problem with the learning rate decay.
Essentially, every episode your learning rate is multiplied by some factor j, which means the rate after n episodes/epochs equals lr = initial_lr * j^n. In your example the decay is set to 0.95, which means that after only a handful of iterations the learning rate has already dropped substantially. From then on the updates only make tiny corrections and no longer "learn" anything significant.
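To see how quickly that happens with the 0.95 factor from your code, here is a tiny sketch (plain arithmetic in Python, nothing taken from your repo) of a rate that is multiplied by 0.95 once per update:

# Exponential decay: rate_n = initial_rate * decay ** n
initial_rate = 1.0
decay = 0.95
for n in (10, 50, 100, 200):
    print(f"after {n:3d} updates: {initial_rate * decay ** n:.6f}")
# after  10 updates: 0.598737
# after  50 updates: 0.076945
# after 100 updates: 0.005921
# after 200 updates: 0.000035

After roughly 100 updates the rate is below one percent of its starting value, which matches the "updates only make tiny corrections" behaviour described above.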
This raises the question: why does decay make sense at all? Generally, we want to reach a local optimum (which may be very narrow). To do that, we first try to get "reasonably close" to such a minimum, and then take only small steps that bring us to the optimum itself. If we simply kept the original learning rate, we might keep stepping over the optimal solution every time and never reach our goal. Visually, the problem can be summed up by the following graphic:
An alternative to decay is to simply reduce the learning rate by some amount once the algorithm stops making any significant updates. That avoids the problem of the rate being driven down purely by the number of episodes that have passed, as in the sketch below.
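A minimal sketch of that plateau idea (the factor, patience and minimum values are made-up illustration numbers, not something taken from your code):

# Cut the rate only when the loss has stopped improving for `patience` steps.
class PlateauDecay:
    def __init__(self, rate, factor=0.5, patience=20, min_rate=1e-5):
        self.rate = rate
        self.factor = factor
        self.patience = patience
        self.min_rate = min_rate
        self.best_loss = float("inf")
        self.bad_steps = 0

    def step(self, loss):
        # Reset the counter on any improvement, otherwise count a "bad" step.
        if loss < self.best_loss:
            self.best_loss = loss
            self.bad_steps = 0
        else:
            self.bad_steps += 1
            if self.bad_steps >= self.patience:
                self.rate = max(self.rate * self.factor, self.min_rate)
                self.bad_steps = 0
        return self.rate

Calling step(loss) after each replay batch leaves the rate untouched as long as training is still making progress, and only lowers it once the loss has stalled.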
In your case specifically, a higher decay value (i.e. a slower decay) already seemed to help quite a bit.
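For example (the exact value would need tuning on your side, 0.999 is just an illustration), a slower decay keeps a useful amount of the rate for thousands of updates instead of dozens:

for decay in (0.95, 0.999):
    print(decay, [round(decay ** n, 4) for n in (100, 1000, 2000)])
# 0.95  [0.0059, 0.0, 0.0]
# 0.999 [0.9048, 0.3677, 0.1352]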
Answer 1 (score: 0)
The Q-value in reinforcement learning does not represent the "reward" but the "return", i.e. the current reward plus the discounted sum of future rewards. When your model falls into a "dead end" of all-zero actions, the reward is zero under your setup. After a while your replay memory is full of "zero action leads to zero return" experiences, so no matter how you update the model, you cannot escape that dead end.
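In other words (a generic sketch, not tied to your code), the value the network is regressed toward is the discounted return:

def discounted_return(rewards, gamma=0.95):
    # G_t = r_t + gamma * r_{t+1} + gamma**2 * r_{t+2} + ...
    g = 0.0
    for r in reversed(rewards):
        g = r + gamma * g
    return g

print(discounted_return([0, 0, 0, 1.0]))   # 0.857375 == 0.95 ** 3

If every stored reward is 0 and every bootstrapped future value is also 0, that return is 0 for all three actions, which is exactly the dead end described above.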
As @dennlinger said, you could increase epsilon so that the model gets some fresh memories to update from, or use prioritized experience replay so that training focuses on the "useful" experiences, as sketched below.
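As an illustration of the second option, here is a minimal sketch of proportional prioritized sampling (simplified, without importance-sampling weights; td_errors is assumed to be something your own replay code tracks, so treat this as an outline rather than a drop-in implementation):

import numpy as np

def sample_prioritized(memory, td_errors, batch_size, alpha=0.6, eps=1e-3):
    # Sample transitions with probability proportional to |TD error| ** alpha,
    # so surprising ("useful") experiences are replayed more often.
    priorities = (np.abs(td_errors) + eps) ** alpha
    probs = priorities / priorities.sum()
    idx = np.random.choice(len(memory), size=batch_size, p=probs)
    return [memory[i] for i in idx], idx

Transitions whose predicted Q-value was far off get sampled more often, so the rare non-zero-reward experiences are not drowned out by the flood of zero-reward ones.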
However, I would suggest you focus on the environment itself first. Your model outputs zeros because there is no better option, right? Since you are trading stocks, are you sure the state contains enough information to arrive at a strategy whose average reward is greater than zero? I think you need to consider that carefully before doing any tuning. For example, if the stock moves up or down with a purely random 50/50 chance, you will never find a strategy whose average return is greater than zero.
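A quick toy check of that last point (synthetic coin-flip data, not your stock data): if the per-step price move is a symmetric ±1, holding a long position has zero expected reward, and no amount of training can change that.

import numpy as np

rng = np.random.default_rng(0)
moves = rng.choice([-1.0, 1.0], size=1_000_000)   # 50/50 up/down price moves
print(moves.mean())                               # ~0.0: no exploitable edge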
The reinforcement learning agent may already have found the best policy it can, even though it is not the one you want.