I have created the following custom environment:
import random

import gym
import numpy as np
from gym import spaces


class Market(gym.Env):
    """Environment for training a BUYING, VWAP-beating algorithm with
    OpenAI Gym reinforcement learning algorithms."""
    metadata = {'render.modes': ['human']}

    def __init__(self, list_of_df):
        super(Market, self).__init__()
        self.list_of_df = list_of_df
        self.current_day = list_of_df[0]
        self.reward_range = (-2147483647, 2147483647)
        self.current_step = 0
        self.last_ind_in_day = len(list_of_df[0]) - 1
        self.A_VWAP = 0
        self.A_rolling_vol = 0
        self.A_rolling_price = 0
        self.A_vol_left = 1000
        self.reward = 0
        self.done = False
        # To keep track of the agent's VWAP:
        self.cum_VbyP = 0
        self.cum_vol_traded = 0
        self.purchase_vol = 80
        self.action_space = spaces.Box(low=np.array([0, 0]), high=np.array([3, 1]), dtype=np.float16)
        # Each observation row contains:
        #   - The OHLC prices for the 5-minute interval
        #   - Milliseconds from midnight
        #   - The rolling VWAP for this time period
        #   - The agent's rolling VWAP, A_VWAP
        #   - The volume of securities still left to buy, A_vol_left
        #   - The volume traded in the market this time step
        self.observation_space = spaces.Box(
            low=-2147483647, high=2147483647, shape=(1, len(list_of_df[1].iloc[2])), dtype=np.float16)

    def _take_action(self, a):
        # Only buy if there are still shares to be bought today
        if self.A_vol_left > 0:
            # Purchase a[0] times the base trade volume
            vol = self.purchase_vol * a[0]
            print(vol)
            # But cap it if there aren't enough shares left to buy
            if vol > self.A_vol_left:
                vol = self.A_vol_left
            self.A_vol_left = self.A_vol_left - vol
            # Increase the cumulative volume of shares traded:
            self.cum_vol_traded = self.cum_vol_traded + vol
            if vol > 0:
                # Sample a random price between the high and low of this interval:
                price = round(random.uniform(self.current_day['Low'].iloc[self.current_step],
                                             self.current_day['High'].iloc[self.current_step]))
                # Update the cumulative price multiplied by volume:
                self.cum_VbyP = self.cum_VbyP + (vol * price)
                # Update the agent's VWAP, A_VWAP
                self.A_VWAP = self.cum_VbyP / self.cum_vol_traded

    def _next_observation(self):
        frame = np.array([self.current_day.iloc[self.current_step]])
        frame[:, -1] = self.A_VWAP
        frame[:, -2] = self.A_vol_left
        return frame

    def step(self, action):
        # Execute one time step within the environment
        print(action)
        self._take_action(action)
        self.current_step += 1
        reward = 0  # always return zero until the last step of the day
        if self.current_step == self.last_ind_in_day:
            if self.A_vol_left < 1:
                reward = self.current_day['VWAP'].iloc[self.current_step] - self.A_VWAP
            else:
                reward = -999999
            self.done = True
        obs = self._next_observation()
        return obs, reward, self.done, {}

    def reset(self):
        # Reset the state of the environment to a random day
        ind = random.randrange(0, len(self.list_of_df))
        self.current_day = self.list_of_df[ind]
        # Reset the step counter to the start of the day
        self.current_step = 1
        self.A_VWAP = 0
        self.A_rolling_vol = 0
        self.A_rolling_price = 0
        self.A_vol_left = 1000
        self.reward = 0
        self.done = False
        self.last_ind_in_day = len(self.list_of_df[ind]) - 1
        # To keep track of the agent's VWAP:
        self.cum_VbyP = 0
        self.cum_vol_traded = 0
        return self._next_observation()
# ====================== End of MARKET class =======================
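Before handing the environment to PPO2, it can help to step it manually with randomly sampled actions and confirm that every observation and reward stays finite. A minimal sketch, assuming the Market class above and a hypothetical list of per-day DataFrames named dfs with all-numeric columns:

env = Market(dfs)
obs = env.reset()
done = False
while not done:
    action = env.action_space.sample()  # random action within [0, 3] x [0, 1]
    obs, reward, done, info = env.step(action)
    if not np.isfinite(obs).all():
        print('Non-finite observation at step', env.current_step)
        break
    if not np.isfinite(reward):
        print('Non-finite reward at step', env.current_step)
        break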
When I use a PPO2 agent to step through this environment, it always chooses [nan, nan] as the action. Here is the code used to train the model:
train_env = DummyVecEnv([lambda: Market(train_df)])
test_env = DummyVecEnv([lambda: Market(test_df)])

model = PPO2('MlpLstmPolicy', train_env, nminibatches=1, verbose=0)

n_steps = 2000
for i in range(1000):
    model.learn(n_steps)
I have added print statements to the environment's _take_action and step methods to show which actions are being taken, and the actions are always:
[nan nan]
Here is my GitHub repo with the full IPython notebook, which I ran from Google Colab:
https://github.com/maaxnaax/rl_ipython/blob/master/Copy_of_VWAP_Env.ipynb
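One way to pin down where the NaNs first enter the pipeline, not used in the notebook above, is stable-baselines' VecCheckNan wrapper (available in recent stable-baselines releases), which raises as soon as a NaN or inf appears in the observations, rewards, or actions. A hedged sketch reusing the same Market and train_df as above:

from stable_baselines import PPO2
from stable_baselines.common.vec_env import DummyVecEnv, VecCheckNan

# Wrap the vectorised env so the first NaN/inf raises immediately and
# reports whether it came from the observation, the reward, or the action.
checked_env = VecCheckNan(DummyVecEnv([lambda: Market(train_df)]), raise_exception=True)
model = PPO2('MlpLstmPolicy', checked_env, nminibatches=1, verbose=0)
model.learn(2000)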
Answer 0 (score: 0)
I ran into a problem similar to yours, and then I removed the train_df and test_df parts because I didn't actually need them. After that my problem was resolved. I suspect your problem may also be related to the dataset.
There is also a blog post, https://towardsdatascience.com/creating-a-custom-openai-gym-environment-for-stock-trading-be532be3910e, where the author ran into a problem similar to yours.
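If the dataset is the culprit, a quick scan of the per-day DataFrames (assumed here to be the lists passed in as train_df and test_df) can confirm whether any NaN or infinite values would end up in the observations. A minimal sketch, assuming every column is numeric:

import numpy as np

def find_bad_days(list_of_df):
    # Return the indices of DataFrames containing NaN or infinite values.
    bad = []
    for i, df in enumerate(list_of_df):
        values = df.to_numpy(dtype=np.float64)  # assumes all columns are numeric
        if not np.isfinite(values).all():
            bad.append(i)
    return bad

print('Bad training days:', find_bad_days(train_df))
print('Bad test days:', find_bad_days(test_df))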