import numpy as np
import gym
import tensorflow as tf
from tensorflow.keras import Sequential, Input
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Dense
def NN(inputs, outputs, lr):
    # Simple MLP: state in, one raw score per action out
    model = Sequential()
    model.add(Input(shape=(inputs,)))
    model.add(Dense(256, activation='relu'))
    model.add(Dense(128, activation='relu'))
    model.add(Dense(outputs))
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr), loss='mse')
    return model
env = gym.make('MountainCar-v0')
observation_space = 2  # car position and velocity
action_space = 3       # push left, no push, push right
gamma = 0.95
learning_rate = 0.05
policy = NN(observation_space, action_space, learning_rate)

reward_mem = []
action_mem = []
state_mem = []
num_games = 1000
for i in range(num_games):
    initial = env.reset()
    reward_sum = 0
    done = False
    while not done:
        # Greedy action from the network's raw outputs
        act = np.argmax(policy.predict(np.expand_dims(initial, axis=0)))
        obs, reward, done, _ = env.step(act)
        reward_mem.append(reward)
        state_mem.append(initial)
        # Encode the chosen action: on_value=2, off_value=1, so K.log below
        # gives log 2 for the taken action and log 1 = 0 for the others
        action_mem.append(tf.one_hot(act, action_space, off_value=1, on_value=2))
        reward_sum += reward
        initial = obs
    # Single discounted return for the whole episode
    G = 0
    print(reward_sum)
    for step in range(len(reward_mem)):
        G += reward_mem[step] * np.power(gamma, step)
    G -= np.mean(G)  # subtract the mean as a baseline

    # Training targets: -log of the encoded actions, scaled by the return
    out = -K.log(tf.convert_to_tensor(action_mem, dtype=tf.float32)) * G
    policy.train_on_batch(tf.convert_to_tensor(state_mem), out)

    state_mem = []
    reward_mem = []
    action_mem = []
I've run this for more than 1000 iterations and the agent doesn't learn at all. I'm new to reinforcement learning and would really appreciate some insight into what I'm doing wrong here. I suspect I've messed up the log-loss part; I've looked at a few code examples, but some of them just don't make sense to me. Please tell me what I might be doing wrong. I wanted to keep the code as simple as possible, which is why it's so short. The output is just -200 for all 1000 episodes; it isn't learning at all.
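For reference, this is roughly what I think the standard REINFORCE update is supposed to look like, pieced together from the examples I've read. It's only a sketch, not my code above: discount_rewards and reinforce_step are names I made up, and it assumes the model outputs raw logits that get a softmax applied inside the loss.

import numpy as np
import tensorflow as tf

def discount_rewards(rewards, gamma):
    # One discounted return-to-go per timestep, not a single scalar per episode
    returns = np.zeros(len(rewards), dtype=np.float32)
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    # Normalize as a crude baseline
    returns = (returns - returns.mean()) / (returns.std() + 1e-8)
    return returns

def reinforce_step(model, optimizer, states, actions, rewards, gamma=0.99):
    # states: (T, obs_dim) float32, actions: (T,) ints, rewards: list of T floats
    returns = tf.convert_to_tensor(discount_rewards(rewards, gamma))
    actions = tf.convert_to_tensor(actions, dtype=tf.int32)
    with tf.GradientTape() as tape:
        logits = model(states)
        # log pi(a_t | s_t) for the actions that were actually taken
        log_probs = -tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=actions, logits=logits)
        loss = -tf.reduce_mean(log_probs * returns)
    grads = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(grads, model.trainable_variables))
    return loss

If that's right, then during an episode the action should be sampled, e.g. probs = tf.nn.softmax(model(state[None]))[0].numpy() and act = np.random.choice(action_space, p=probs), rather than taken with argmax like I do. Is that the part I'm getting wrong, or is it the loss itself?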