Policy gradient network doesn't want to learn the environment

Time: 2020-09-23 09:44:09

Tags: python reinforcement-learning policy-gradient-descent

import tensorflow as tf
import gym
from tensorflow.keras import Sequential, Input
from tensorflow.keras.layers import Dense
from tensorflow.keras import backend as K
import numpy as np

def NN(inputs, outputs, lr):
    # Feed-forward net: state in, one raw score per action out (no softmax),
    # compiled with MSE against the hand-built targets passed to train_on_batch below.
    model = Sequential()
    model.add(Input(shape=(inputs,)))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(outputs))
    model.compile(optimizer=tf.optimizers.Adam(learning_rate=lr), loss='mse')
    return model


env = gym.make('MountainCar-v0')
observation_space = 2   # MountainCar state: position, velocity
action_space = 3        # push left, no push, push right
gamma = 0.95
learning_rate = 0.05
policy = NN(observation_space, action_space, learning_rate)
reward_mem = []
action_mem = []
state_mem = []
num_games = 1000
for i in range(num_games):
    initial = env.reset()
    reward_sum = 0
    done = False

    while not done:
        # Greedy action: argmax over the network's raw outputs (no sampling / exploration).
        act = np.argmax(policy.predict(np.expand_dims(initial, axis=0)))
        obs, reward, done, _ = env.step(act)
        reward_mem.append(reward)
        state_mem.append(initial)
        # Encode the chosen action (on_value=2, off_value=1 instead of the usual 1/0).
        action_mem.append(tf.one_hot(act, action_space, off_value=1, on_value=2))
        reward_sum += reward
        initial = obs
    G = 0
    print(reward_sum)


    # Single scalar discounted sum of the whole episode's rewards
    # (the same G is then applied to every step of the episode).
    for step in range(len(reward_mem)):
        G += reward_mem[step] * np.power(gamma, step)
    G -= np.mean(G)
    # Training target: -log of the encoded actions, scaled by G.
    out = -K.log(tf.convert_to_tensor(action_mem, dtype=np.float32)) * G
    # print(out.shape)

    policy.train_on_batch(tf.convert_to_tensor(state_mem), out)
    # print('Printing Out')
    # print(out)
    state_mem = []
    reward_mem = []
    action_mem = []
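
(For reference, separate from the script above: my understanding of the per-step discounted return from the textbook, G_t = r_t + gamma * G_{t+1}, is the helper below. The name discounted_returns is just a placeholder of mine, and this is not what my loop above actually computes, so it may be wrong too.)

import numpy as np

def discounted_returns(rewards, gamma=0.95):
    # Returns-to-go: G_t = r_t + gamma * G_{t+1}, computed backwards over the episode.
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    # Normalize to reduce variance (a common trick in the examples I have seen).
    return (returns - returns.mean()) / (returns.std() + 1e-8)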

OK, I have tried running this code for more than 1000 iterations and the agent simply doesn't want to learn. I'm new to reinforcement learning and I would really like some insight into what I'm doing wrong here. I know I have probably messed up the log-loss function; I have looked at some code examples, but some of them just don't make sense to me. Please tell me what I might be doing wrong. I wanted to create the simplest possible code, which is why it is so short. The output is just -200, all 1000 times. It is not learning at all.
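
From the examples I have read, the usual REINFORCE update seems to sample actions from a softmax over the network's outputs and weight -log pi(a|s) by the return G. My rough sketch of that update is below; reinforce_update and policy_net are placeholder names of mine, and the sketch reflects my reading of those examples rather than code I know to be correct:

import numpy as np
import tensorflow as tf

def reinforce_update(policy_net, optimizer, states, actions, returns):
    # One REINFORCE step: increase log pi(a_t|s_t) in proportion to the return G_t.
    states = tf.convert_to_tensor(np.array(states), dtype=tf.float32)
    actions = tf.convert_to_tensor(actions, dtype=tf.int32)
    returns = tf.convert_to_tensor(returns, dtype=tf.float32)
    with tf.GradientTape() as tape:
        logits = policy_net(states)                # raw scores, one per action
        log_probs = tf.nn.log_softmax(logits)      # log pi(.|s)
        taken = tf.reduce_sum(tf.one_hot(actions, logits.shape[-1]) * log_probs, axis=1)
        loss = -tf.reduce_mean(taken * returns)    # policy-gradient loss
    grads = tape.gradient(loss, policy_net.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy_net.trainable_variables))
    return float(loss)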

0 Answers:

No answers yet.