以下是代码：

Question

我目前正在进入tensorflow，并且刚刚开始掌握图形概念。现在我尝试使用梯度下降（Adam optimizer）来实现NN来解决cartpole环境。我首先随机初始化我的重量，然后在训练期间采取随机动作（考虑现有的重量）。在测试时，我总是以最大概率采取行动。但是我总是得到一个徘徊在10左右的分数，差异大约是0.8。总是。它并没有以一种显着的方式改变，使得它看起来总是在每一步都采取纯粹的随机行动，而不是学习任何东西。正如我所说，似乎权重永远不会正确更新。我需要在哪里以及如何做到这一点？

这是我的代码：

import tensorflow as tf
import numpy as np
from gym.envs.classic_control import CartPoleEnv



env = CartPoleEnv()

learning_rate = 10**(-3)
gamma = 0.9999

n_train_trials = 10**3
n_test_trials = 10**2

n_actions = env.action_space.n
n_obs = env.observation_space.high.__len__()

goal_steps = 200

should_render = False

print_per_episode = 100

state_holder = tf.placeholder(dtype=tf.float32, shape=(None, n_obs), name='symbolic_state')
actions_one_hot_holder = tf.placeholder(dtype=tf.float32, shape=(None, n_actions),
                                        name='symbolic_actions_one_hot_holder')
discounted_rewards_holder = tf.placeholder(dtype=tf.float32, shape=None, name='symbolic_reward')

# initialize neurons list dynamically
def get_neurons_list():
    i = n_obs
    n_neurons_list = [i]

    while i < (n_obs * n_actions) // (n_actions // 2):
        i *= 2
        n_neurons_list.append(i)

    while i // 2 > n_actions:
        i = i // 2
        n_neurons_list.append(i)

    n_neurons_list.append(n_actions)

    # print(n_neurons_list)

    return n_neurons_list


with tf.name_scope('nonlinear_policy'):
    # create list of layers with sizes
    n_neurons_list = get_neurons_list()

    network = None

    for i in range((len(n_neurons_list) - 1)):
        theta = tf.Variable(tf.random_normal([n_neurons_list[i], n_neurons_list[i+1]]))
        bias = tf.Variable(tf.random_normal([n_neurons_list[i+1]]))

        if network is None:
            network = tf.matmul(state_holder, theta) + bias
        else:
            network = tf.matmul(network, theta) + bias

        if i < len(n_neurons_list) - 1:
            network = tf.nn.relu(network)

    action_probabilities = tf.nn.softmax(network)

    testing_action_choice = tf.argmax(action_probabilities, dimension=1, name='testing_action_choice')

with tf.name_scope('loss'):
    actually_chosen_probability = action_probabilities * actions_one_hot_holder

    L_theta = -1 * (tf.reduce_sum(tf.log(actually_chosen_probability)) * tf.reduce_sum(discounted_rewards_holder))


with tf.name_scope('train'):
    # We define the optimizer to use the ADAM optimizer, and ask it to minimize our loss
    gd_opt = tf.train.AdamOptimizer(learning_rate).minimize(L_theta)


sess = tf.Session()  # FOR NOW everything is symbolic, this object has to be called to compute each value of Q

# Start

sess.run(tf.global_variables_initializer())

observation = env.reset()
batch_rewards = []
states = []
action_one_hots = []

episode_rewards = []
episode_rewards_list = []
episode_steps_list = []

step = 0
episode_no = 0
while episode_no <= n_train_trials:
    if should_render: env.render()
    step += 1

    action_probability_values = sess.run(action_probabilities,
                                         feed_dict={state_holder: [observation]})
    # Choose the action using the action probabilities output by the policy implemented in tensorflow.
    action = np.random.choice(np.arange(n_actions), p=action_probability_values.ravel())

    # Calculating the one-hot action array for use by tensorflow
    action_arr = np.zeros(n_actions)
    action_arr[action] = 1.
    action_one_hots.append(action_arr)

    # Record states
    states.append(observation)

    observation, reward, done, info = env.step(action)
    # We don't want to go above 200 steps
    if step >= goal_steps:
        done = True

    batch_rewards.append(reward)
    episode_rewards.append(reward)

    # If the episode is done, and it contained at least one step, do the gradient updates
    if len(batch_rewards) > 0 and done:

        # First calculate the discounted rewards for each step
        batch_reward_length = len(batch_rewards)
        discounted_batch_rewards = batch_rewards.copy()
        for i in range(batch_reward_length):
            discounted_batch_rewards[i] *= (gamma ** (batch_reward_length - i - 1))

        # Next run the gradient descent step
        # Note that each of action_one_hots, states, discounted_batch_rewards has the first dimension as the length
        # of the current trajectory
        gradients = sess.run(gd_opt, feed_dict={actions_one_hot_holder: action_one_hots, state_holder: states,
                                                discounted_rewards_holder: discounted_batch_rewards})


        action_one_hots = []
        states = []
        batch_rewards = []

    if done:
        # Done with episode. Reset stuff.
        episode_no += 1

        episode_rewards_list.append(np.sum(episode_rewards))
        episode_steps_list.append(step)

        episode_rewards = []

        step = 0

        observation = env.reset()

        if episode_no % print_per_episode == 0:
            print("Episode {}: Average steps in last {} episodes".format(episode_no, print_per_episode),
                  np.mean(episode_steps_list[(episode_no - print_per_episode):episode_no]), '+-',
                  np.std(episode_steps_list[(episode_no - print_per_episode):episode_no])
                  )


observation = env.reset()

episode_rewards_list = []
episode_rewards = []
episode_steps_list = []

step = 0
episode_no = 0

print("Testing")
while episode_no <= n_test_trials:
    env.render()
    step += 1

    # For testing, we choose the action using an argmax.
    test_action, = sess.run([testing_action_choice],
                            feed_dict={state_holder: [observation]})

    observation, reward, done, info = env.step(test_action[0])
    if step >= 200:
        done = True
    episode_rewards.append(reward)

    if done:
        episode_no += 1

        episode_rewards_list.append(np.sum(episode_rewards))
        episode_steps_list.append(step)

        episode_rewards = []
        step = 0
        observation = env.reset()

        if episode_no % print_per_episode == 0:
            print("Episode {}: Average steps in last {} episodes".format(episode_no, print_per_episode),
                  np.mean(episode_steps_list[(episode_no - print_per_episode):episode_no]), '+-',
                  np.std(episode_steps_list[(episode_no - print_per_episode):episode_no])
                  )

Answer 1

以下是使用Q Learning学习 CartPole Open Gym的示例tensorflow程序。

能够快速学会保持直立80步。

以下是代码：

导入数学导入numpy为np 导入系统随机导入 sys.path.append（＆＃34; ../健身房＆＃34）来自gym.envs.classic_control导入CartPoleEnv env = CartPoleEnv（）

discount = 0.5
learning_rate = 0.5
gradient = .001
regularizaiton_factor = .1

import tensorflow as tf

tf_state    = tf.placeholder( dtype=tf.float32 , shape=[4] )
tf_state_2d    = tf.reshape( tf_state , [1,4] )

tf_action   = tf.placeholder( dtype=tf.int32 )
tf_action_1hot = tf.reshape( tf.one_hot( tf_action , 2 ) , [1,2] )

tf_delta_reward = tf.placeholder( dtype=tf.float32 )
tf_value        = tf.placeholder( dtype=tf.float32 )

tf_matrix1   = tf.Variable( tf.random_uniform([4,7], -.001, .001) )
tf_matrix2   = tf.Variable( tf.random_uniform([7,2], -.001, .001) )

tf_logits    = tf.matmul( tf_state_2d , tf_matrix1 ) 
tf_logits    = tf.matmul( tf_logits , tf_matrix2 )


tf_loss = -1 * learning_rate * ( tf_delta_reward + discount * tf_value - tf_logits ) * tf_action_1hot
tf_regularize = tf.reduce_mean( tf.square( tf_matrix1 )) + tf.reduce_mean( tf.square( tf_matrix2 ))
tf_train = tf.train.GradientDescentOptimizer(gradient).minimize( tf_loss + tf_regularize * regularizaiton_factor )


sess = tf.Session()
sess.run( tf.global_variables_initializer() )

def max_Q( state ) :
    actions = sess.run( tf_logits, feed_dict={ tf_state:state } )
    actions = actions[0]
    value = actions.max()
    action = 0 if actions[0] == value else 1
    return action , value


avg_age = 0
for trial in range(1,101) :

    # initialize state
    previous_state = env.reset()
    # initialize action and the value of the expected reward
    action , value = max_Q(previous_state)


    previous_reward = 0
    for age in range(1,301) :
        if trial % 100 == 0 :
            env.render()

        new_state, new_reward, done, info = env.step(action)
        new_state = new_state
        action, value = max_Q(new_state)

        # The cart-pole gym doesn't return a reward of Zero when done.
        if done :
            new_reward = 0

        delta_reward = new_reward - previous_reward

        # learning phase
        sess.run(tf_train, feed_dict={ tf_state:previous_state, tf_action:action, tf_delta_reward:delta_reward, tf_value:value })

        previous_state  = new_state
        previous_reward = new_reward

        if done :
            break

    avg_age = avg_age * 0.95 + age * .05
    if trial % 50 == 0 :
        print "Average age =",int(round(avg_age))," , trial",trial," , discount",discount," , learning_rate",learning_rate," , gradient",gradient
    elif trial % 10 == 0 :
        print int(round(avg_age)),

这是输出：

6 18 23 30 Average age = 36  , trial 50  , discount 0.5  , learning_rate 0.5  , gradient 0.001
38 47 50 53 Average age = 55  , trial 100  , discount 0.5  , learning_rate 0.5  , gradient 0.001

摘要

我无法使用简单的神经网络进行Q学习，以便能够解决 CartPole 问题，但尝试使用不同的NN大小和深度进行实验！

希望您喜欢此代码，欢呼声

深度神经网络不会在训练时更新权重

1 个答案:

以下是代码：

这是输出：

摘要