我使用 Keras 框架在 Python 中实现了深度 Q 学习(DQN),以重现一篇论文的结果。但是,它不起作用。以下是相关代码:
def build_dqn(lr, n_actions, input_dims, fc1_dims, fc2_dims, CLIP_GRADIENT=1):
    """Assemble and compile the fully connected Q-value network.

    The network is two hidden blocks of Dense -> ReLU -> BatchNormalization
    followed by a linear Dense output with one unit per action.

    Args:
        lr: Learning rate handed to the Adam optimizer.
        n_actions: Number of output units (one Q-value per action).
        input_dims: Length of the flat input vector.
        fc1_dims: Number of units in the first hidden layer.
        fc2_dims: Number of units in the second hidden layer.
        CLIP_GRADIENT: Value passed to Adam's ``clipvalue`` argument.

    Returns:
        The compiled model, using mean-squared-error loss.
    """
    network = Sequential()

    # First hidden block.
    network.add(Dense(fc1_dims, input_shape=(input_dims,)))
    network.add(Activation('relu'))
    network.add(BatchNormalization())

    # Second hidden block.
    network.add(Dense(fc2_dims))
    network.add(Activation('relu'))
    network.add(BatchNormalization())

    # Linear output head: raw Q-value estimates, no activation.
    network.add(Dense(n_actions))

    optimizer = Adam(lr=lr, clipvalue=CLIP_GRADIENT)
    network.compile(optimizer=optimizer, loss='mse')
    return network
class Agent(object):
    """Epsilon-greedy deep Q-learning agent driven by a single Keras network.

    The agent keeps a replay buffer, a Q-network built by ``build_dqn`` and an
    exploration rate ``epsilon`` that is decayed multiplicatively after every
    learning step until it reaches ``epsilon_min``.
    """

    def __init__(self, alpha, gamma, n_actions, epsilon, batch_size,
                 input_dims, epsilon_dec=0.996, epsilon_end=0.01,
                 mem_size=1000000, fname='dqn_model.h5'):
        """Create the replay buffer and the Q-network.

        Args:
            alpha: Learning rate forwarded to ``build_dqn``.
            gamma: Discount factor applied to the bootstrapped next-state value.
            n_actions: Number of discrete actions.
            epsilon: Initial probability of picking a random action.
            batch_size: Number of transitions sampled per learning step.
            input_dims: Length of the (processed) state vector.
            epsilon_dec: Multiplicative decay applied to ``epsilon``.
            epsilon_end: Lower bound for ``epsilon``.
            mem_size: Capacity handed to ``ReplayBuffer``.
            fname: File path used by ``save_model`` / ``load_model``.
        """
        self.action_space = list(range(n_actions))
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_dec = epsilon_dec
        self.epsilon_min = epsilon_end
        self.batch_size = batch_size
        self.model_file = fname
        self.memory = ReplayBuffer(mem_size, input_dims, n_actions,
                                   discrete=True)
        self.q_eval = build_dqn(alpha, n_actions, input_dims, 64, 32)

    def remember(self, state, action, reward, new_state, done):
        """Store one transition in the replay buffer."""
        self.memory.store_transition(state, action, reward, new_state, done)

    def choose_action(self, state):
        """Pick an action epsilon-greedily for a single processed state.

        With probability ``epsilon`` a uniformly random action is returned,
        otherwise the arg-max of the network's prediction.
        """
        # The network expects a batch axis in front of the feature axis.
        state = state[np.newaxis, :]
        if np.random.random() < self.epsilon:
            return np.random.choice(self.action_space)
        return np.argmax(self.q_eval.predict(state))

    def learn(self):
        """Run one Q-learning update on a sampled mini-batch.

        Does nothing until the buffer counter exceeds ``batch_size``. After a
        fit, ``epsilon`` is decayed towards ``epsilon_min``.
        """
        if self.memory.mem_cntr <= self.batch_size:
            return

        state, action, reward, new_state, done = \
            self.memory.sample_buffer(self.batch_size)

        # Turn the sampled action encoding into column indices by taking the
        # dot product with [0, 1, ..., n_actions - 1].
        # presumably `action` is one-hot (buffer built with discrete=True) —
        # verify against ReplayBuffer.
        action_values = np.array(self.action_space, dtype=np.int8)
        action_indices = np.dot(action, action_values)

        q_eval = self.q_eval.predict(state)
        q_next = self.q_eval.predict(new_state)

        # Only the entry of the action that was actually taken is changed, so
        # the loss on all other outputs is zero.
        q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)
        # NOTE(review): `done` is used as a multiplicative mask on the
        # bootstrap term, which is only right if sample_buffer returns 1 for
        # non-terminal and 0 for terminal transitions — confirm against
        # ReplayBuffer.
        q_target[batch_index, action_indices] = reward + \
            self.gamma * np.max(q_next, axis=1) * done

        _ = self.q_eval.fit(state, q_target, verbose=0)

        if self.epsilon > self.epsilon_min:
            self.epsilon = self.epsilon * self.epsilon_dec
        else:
            self.epsilon = self.epsilon_min

    def processState(self, state):
        """Convert a raw environment state into the network's input vector.

        The last element of ``state`` is kept unchanged. The remaining
        elements are reshaped into rows of 30 values, each row is replaced by
        the relative difference between consecutive values (29 per row),
        every row is standardized, and the result is flattened with the last
        element appended again.

        Args:
            state: 1-D array whose length minus one is a multiple of 30.

        Returns:
            1-D array of length ``29 * ((len(state) - 1) // 30) + 1``.
        """
        n = len(state)
        series, prev_position = state[:n - 1], state[n - 1]
        series = series.reshape((int(n / 30), 30))
        # (x[t+1] - x[t]) / x[t] along each row.
        relative_diff = np.diff(series) / series[:, :-1]
        # The scaler works column-wise, so transpose to standardize each row.
        relative_diff = StandardScaler().fit_transform(relative_diff.T).T
        return np.append(relative_diff.flatten(), prev_position)

    def processReward(self, reward, rewardClipping=1):
        """Clip ``reward`` to ``[-rewardClipping, rewardClipping]``."""
        return np.clip(reward, -rewardClipping, rewardClipping)

    def train_model(self, trainingEnv, n_episodes=1, verbose=0):
        """Train the agent on ``trainingEnv`` for ``n_episodes`` episodes.

        At every step two transitions leaving the *current* state are stored:
        the one actually taken, and the counterfactual one for the other
        action, which the environment reports through ``info['Reward']``,
        ``info['State']`` and ``info['Done']``. One learning step follows.

        Args:
            trainingEnv: Environment exposing ``reset()``, ``step(action)``
                (returning ``(state, reward, done, info)``) and ``render()``.
            n_episodes: Number of passes over the environment.
            verbose: When truthy, print the episode score and the running
                average over the most recent episodes.
        """
        scores = []
        eps_history = []
        for i in range(n_episodes):
            done = False
            score = 0
            # Use the environment that was passed in, not a module global.
            observation = self.processState(trainingEnv.reset())
            while not done:
                action = self.choose_action(observation)
                observation_, reward, done, info = trainingEnv.step(action)

                # Transition that was actually executed. The source state is
                # the state the action was chosen in, not the resulting one.
                reward = self.processReward(reward)
                observation_ = self.processState(observation_)
                score += reward
                self.remember(observation, action, reward, observation_,
                              int(done))

                # Counterfactual transition for the other action, starting
                # from the same source state => better exploration.
                # `not bool(action)` flips between 0 and 1, so this assumes
                # exactly two actions.
                otherAction = int(not bool(action))
                otherReward = self.processReward(info['Reward'])
                otherNextState = self.processState(info['State'])
                otherDone = info['Done']
                self.remember(observation, otherAction, otherReward,
                              otherNextState, otherDone)

                observation = observation_
                self.learn()

            if verbose:
                eps_history.append(self.epsilon)
                scores.append(score)
                avg_score = np.mean(scores[max(0, i - 100):(i + 1)])
                print('episode: ', i, 'score: %.2f' % score,
                      ' average score %.2f' % avg_score)

        # Render once, after all episodes have finished.
        trainingEnv.render()

    def save_model(self):
        """Write the Q-network to ``self.model_file``."""
        self.q_eval.save(self.model_file)

    def load_model(self):
        """Replace the Q-network with the one stored in ``self.model_file``."""
        self.q_eval = load_model(self.model_file)
我从 100 美元的资本开始,在 20 年(大约 10000 步)的区间结束时,资本大致没有变化。我尝试调整参数,但没有任何效果。
以下是主程序:
# Trading environment on GOOGL with 100 units of starting money and no
# transaction costs.
env = TradingEnv(
    marketSymbol="GOOGL",
    period=PERIOD_DEFAULT,
    startingDate=START_DEFAULT,
    endingDate=END_DEFAULT,
    columns=COLUMNS,
    money=100,
    transactionCosts=0,
)

lr = 0.0005

# epsilon starts and ends at 0, and gamma=1 means future value is undiscounted.
agent = Agent(
    alpha=lr,
    gamma=1,
    n_actions=2,
    epsilon=0.00,
    batch_size=32,
    input_dims=117,
    epsilon_end=0.0,
    mem_size=1000000,
)

# Runs with the default number of episodes (n_episodes=1).
agent.train_model(env)
答案 0 :(得分:0)
我想我已经解决了这个问题。需要把训练回合(episode)的数量设置得足够大(而不是 1),在我的例子中 30 个回合就够了。但是,我仍然不知道如何有效地回测深度 Q 学习交易智能体!