Question

我完全被下面这段代码难住了。我是编程新手，这段代码是使用 OpenAI Gym 的 MountainCar 环境进行强化学习练习的一部分。我猜错误出在我初始化 MLPClassifier 的方式以及 generate_session() 函数中。我已经折腾了一个多星期，尝试了各种方法，但在调用 fit() 方法时总是遇到维度不匹配（dimension mismatch）错误。我的经验还不足以自己调试这个问题。如果有人能帮我，我将不胜感激。谢谢。

import os
import time
import gym
import numpy as np
import random
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')

# Build the MountainCar environment and keep the object exposed by its `.env`
# attribute.
# NOTE(review): presumably `.env` strips gym's TimeLimit wrapper so episodes
# are only bounded by t_max in generate_session -- confirm for the gym version.
env = gym.make("MountainCar-v0").env   # creates the environment
env.reset() # resets the environment
n_actions = env.action_space.n  # defines the number of available actions


# Create the agent: a small MLP policy that maps a state to action
# probabilities (read via predict_proba in generate_session).
from sklearn.neural_network import MLPClassifier
agent = MLPClassifier(hidden_layer_sizes=(20,20),
                      activation='relu',
                      warm_start=True, #keep progress between .fit(...) calls
                      max_iter=1, #make only 1 iteration on each .fit(...)
                      learning_rate_init = 0.3
                     )
# Initialize the agent with the state dimension and the number of actions:
# one copy of the initial state is paired with each action label so that every
# action appears as a class in this first fit.
# NOTE(review): with warm_start=True, later fit() calls presumably need `y` to
# contain this same set of classes; an elite batch that is missing an action
# is a likely source of the reported dimension mismatch -- verify against the
# scikit-learn documentation for the installed version.
agent.fit([env.reset()]*n_actions, list(range(n_actions)));

# I suspect the above line is where I must be making the mistake.


# Print relevant information about the environment: the action space, ten
# sampled actions, and the observation space with its upper and lower bounds.

print("Undertstanding the environment")
print("Number of actions: ", env.action_space)
# Each sample() call draws one action from the action space.
print("allowed actions: ", [env.action_space.sample() for _ in range(10)])
print("0 for left, 1 for stop and 2 for right\n")
print("Observation space:", env.observation_space)
print("Observation space:\n car position (x), car velocity (x')")
print("env.observation_space high: ", env.observation_space.high)
print("env.observation_space low: ", env.observation_space.low, "\n")
# The next two messages only describe how the bounds could be overridden;
# nothing in this section actually modifies env.observation_space.
print("\nSetting the observation space maximums/minimums")
print("Ex:\nenv.observation_space.high[1] = 0.1 (velocity)")


# generating sessions which depend only on the previous 5 sessions
from threading import current_thread
def generate_session(t_max=10**4):
    """Play one episode with the module-level ``agent`` and return its tail.

    At every step the action is sampled from ``agent.predict_proba`` for the
    current state (or, with probability ``epsilon``, drawn from the action
    space), then applied with ``env.step``.

    Args:
        t_max: maximum number of steps to play before giving up.

    Returns:
        A tuple ``(states, actions, total_reward)`` where ``states`` and
        ``actions`` hold only the last ``prior`` (5) recorded steps and
        ``total_reward`` is the sum of rewards over the whole episode.
    """
    print("running process", {os.getpid()})
    print("current running thread: ", current_thread().name)
    epsilon = 0.0  # exploration rate; 0.0 means the policy is always followed
    prior = 5      # number of trailing steps returned to the caller
    s = env.reset()
    states, actions = [], []
    total_reward = 0.

    for _ in range(t_max):
        probs = agent.predict_proba([s])
        if random.uniform(0, 1) < epsilon:
            a = env.action_space.sample()
        else:
            # predict_proba returns one row per input sample; squeeze it to a
            # flat probability vector over the actions.
            a = np.random.choice(n_actions, p=np.squeeze(probs))

        new_s, r, done, _info = env.step(a)

        # Record the state the action was chosen in. Advancing `s` before
        # recording would pair each action with the state that followed it
        # and train the policy on misaligned samples.
        states.append(s)
        actions.append(a)
        total_reward += r

        s = new_s
        if done:
            break
    return states[-prior:], actions[-prior:], total_reward


# selecting only states, their corresponding actions and the total rewards, which are greater than the given percentile.

def select_elites(states_batch,actions_batch,rewards_batch,percentile=50):
    """Keep the steps of every session whose reward reaches the percentile.

    Args:
        states_batch: per-session sequences of states.
        actions_batch: per-session sequences of actions.
        rewards_batch: one total reward per session.
        percentile: percentile of ``rewards_batch`` used as the cut-off.

    Returns:
        ``(elite_states, elite_actions)``: two flat lists holding the states
        and actions of all sessions whose reward is >= the cut-off, in
        session order.
    """
    cutoff = np.percentile(rewards_batch, percentile)

    elite_states, elite_actions = [], []
    for session_idx, session_reward in enumerate(rewards_batch):
        if session_reward >= cutoff:
            # Flatten: append this session's steps one by one.
            elite_states.extend(states_batch[session_idx])
            elite_actions.extend(actions_batch[session_idx])

    return elite_states, elite_actions


from IPython.display import clear_output

def show_progress(batch_rewards, log, percentile, reward_range):
    """
    Display training progress for one epoch.

    Appends ``[mean reward, percentile threshold]`` of ``batch_rewards`` to
    ``log``, clears the previous cell output, prints both numbers and draws
    two charts: the logged history and a histogram of the current rewards
    with the threshold marked in red.

    ``reward_range`` is passed to ``plt.hist`` as its ``range`` argument
    (for example ``[-990, +10]``).
    """
    mean_reward = np.mean(batch_rewards)
    threshold = np.percentile(batch_rewards, percentile)
    log.append([mean_reward, threshold])

    clear_output(True)
    print("mean reward = %.3f, threshold=%.3f"%(mean_reward, threshold))

    # Transpose the log once: column 0 holds the means, column 1 the thresholds.
    history = list(zip(*log))

    plt.figure(figsize=[8,4])

    # Left chart: history across epochs.
    plt.subplot(1,2,1)
    plt.plot(history[0], label='Mean rewards')
    plt.plot(history[1], label='Reward thresholds')
    plt.legend()
    plt.grid()

    # Right chart: reward distribution of the current batch.
    plt.subplot(1,2,2)
    plt.hist(batch_rewards, range=reward_range)
    plt.vlines([threshold], [0], [100], label="percentile", color='red')
    plt.legend()
    plt.grid()

    plt.show()



#Training loop
# Generate sessions, select N best and fit to those.

# The following three functions offer a choice between running the sessions serially, in parallel processes via the multiprocessing module, or in parallel processes via the concurrent.futures module (change OPTION in main() before running). The parallel variants are still a work in progress.
# Running the session and generating the graphs
# No multithreading
def create_graphs_no_mp(n_sessions=100, percentile=50, epochs = 100, t_max = 10**4):
    """Train the agent serially and plot progress after every epoch.

    Each epoch generates ``n_sessions`` sessions one after another, selects
    the elite ones, fits the module-level ``agent`` on them and calls
    ``show_progress``.

    Args:
        n_sessions: number of sessions generated per epoch.
        percentile: reward percentile a session must reach to be elite.
        epochs: number of training epochs.
        t_max: maximum number of steps per session.
    """
    log = []
    print("No multiprocessing is used")
    for i in range(epochs):
        tt = time.time()
        sessions = [generate_session(t_max=t_max) for _ in range(n_sessions)]
        # Transpose the list of (states, actions, reward) tuples into three arrays.
        batch_states, batch_actions, batch_rewards = map(np.array, zip(*sessions))
        elite_states, elite_actions = select_elites(batch_states, batch_actions, batch_rewards, percentile)
        agent.fit(elite_states, elite_actions)
        # The histogram range spans the rewards actually observed; the lower
        # bound must be the minimum (it used to be the maximum as well).
        show_progress(batch_rewards, log, percentile,
                      reward_range=[np.min(batch_rewards), np.max(batch_rewards)])
        print("epoch ", i)
        print("time taken for epoch {} = {}".format(i, (time.time()-tt)))
        # Training deliberately keeps running; the message only tells the
        # user that it is safe to interrupt.
        if np.mean(batch_rewards)> -151:
            print("You Win! You may stop training now via KeyboardInterrupt.")


# Running the session and generating the graphs
# using multiprocessing module
def create_graphs_mp(n_sessions=100, percentile=50, epochs = 100, t_max = 10**4):
    """Train the agent, generating sessions with a ``multiprocessing`` pool.

    Each epoch runs ``n_sessions`` calls of ``generate_session(t_max)`` in a
    pool of 5 worker processes, then selects elites, fits the module-level
    ``agent`` and plots progress in the calling process.

    Args:
        n_sessions: number of sessions generated per epoch.
        percentile: reward percentile a session must reach to be elite.
        epochs: number of training epochs.
        t_max: maximum number of steps per session.
    """
    import multiprocessing as mp
    log = []
    print("multiprocessing module is used")
    for i in range(epochs):

        tt = time.time()
        # A new pool is created for every epoch, i.e. after the previous
        # epoch's agent.fit, and the `with` block terminates it so worker
        # processes are not leaked.
        # NOTE(review): presumably the workers must see the freshly fitted
        # module-level agent, and may share inherited RNG state -- confirm
        # for the process start method in use.
        with mp.Pool(processes=5) as pool:
            # map() distributes the calls over the workers and passes t_max
            # as the argument of each call.
            sessions = pool.map(generate_session, [t_max] * n_sessions)

        # Cheap post-processing is done locally instead of in the pool.
        batch_states, batch_actions, batch_rewards = map(np.array, zip(*sessions))
        elite_states, elite_actions = select_elites(batch_states, batch_actions, batch_rewards, percentile)

        agent.fit(elite_states, elite_actions)
        # Lower bound of the histogram range is the minimum observed reward.
        show_progress(batch_rewards, log, percentile,
                      reward_range=[np.min(batch_rewards), np.max(batch_rewards)])
        print("epoch ", i)
        print("time taken for epoch {} = {}".format(i, (time.time()-tt)))
        if np.mean(batch_rewards)> -151:
            print("You Win! You may stop training now via KeyboardInterrupt.")

# Running the session and generating the graphs
# using concurrent.futures
def create_graphs_cf(n_sessions=100, percentile=50, epochs = 100, t_max = 10**4):
    """Train the agent, generating sessions with ``concurrent.futures``.

    Each epoch runs ``n_sessions`` calls of ``generate_session(t_max)`` in a
    ``ProcessPoolExecutor`` with 6 workers, then selects elites, fits the
    module-level ``agent`` and plots progress in the calling process.

    Args:
        n_sessions: number of sessions generated per epoch.
        percentile: reward percentile a session must reach to be elite.
        epochs: number of training epochs.
        t_max: maximum number of steps per session.
    """
    import concurrent.futures
    log = []
    print("concurrent futures module is used")
    for i in range(epochs):

        tt = time.time()

        with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor:
            # One generate_session(t_max) call per session. The lazy result
            # iterator is materialized here, and the list is rebuilt every
            # epoch so that stale sessions from earlier epochs are never used.
            sessions = list(executor.map(generate_session, [t_max] * n_sessions))

        batch_states, batch_actions, batch_rewards = map(np.array, zip(*sessions))
        elite_states, elite_actions = select_elites(batch_states, batch_actions, batch_rewards, percentile)

        agent.fit(elite_states, elite_actions)
        # Lower bound of the histogram range is the minimum observed reward.
        show_progress(batch_rewards, log, percentile,
                      reward_range=[np.min(batch_rewards), np.max(batch_rewards)])
        print("epoch ", i)
        print("time taken for epoch {} = {}".format(i, (time.time()-tt)))
        if np.mean(batch_rewards)> -151:
            print("You Win! You may stop training now via KeyboardInterrupt.")


def main():
    """Run training with the session-generation strategy named by OPTION."""
    OPTION = 'no_mp' # change to any of the following.
    # OPTION = 'no_mp': no multiprocessing used
    # OPTION = 'mp': multiprocessing module used
    # any other value (e.g. 'cf'): concurrent.futures module used (fallback)
    n_sessions = 10 # this is usually 200
    percentile = 60
    epochs = 5 # this is usually 100
    t_max = 1000 # this is usually 10**4
    if OPTION == 'no_mp':
        create_graphs_no_mp(n_sessions, percentile, epochs, t_max)
    elif OPTION == 'mp':
        create_graphs_mp(n_sessions, percentile, epochs, t_max)
    else:
        create_graphs_cf(n_sessions, percentile, epochs, t_max)


# main() is defined at module level and only *called* under the guard, so
# importing this module neither starts training nor hits a NameError from an
# unguarded call to a function that was defined inside the guard.
if __name__ == "__main__":
    main()

MLPClassifier 的 fit 方法出现维度不匹配错误

0 个答案: