I am completely stuck on the code below. I am new to programming, and this is part of a reinforcement-learning exercise using the OpenAI MountainCar environment. I believe the mistake must be in how I initialize the MLPClassifier and in the generate_session() function. I have been playing with it for over a week and trying other approaches, but I keep getting a dimension-mismatch error in the fit() method, and I do not have enough experience to debug it. I would appreciate it if someone could help me here. Thanks.
import os
import time
import gym
import numpy as np
import random
import matplotlib.pyplot as plt
get_ipython().magic('matplotlib inline')
env = gym.make("MountainCar-v0").env # creates the environment
env.reset() # resets the environment
n_actions = env.action_space.n # defines the number of available actions
#create agent
from sklearn.neural_network import MLPClassifier
agent = MLPClassifier(hidden_layer_sizes=(20, 20),
                      activation='relu',
                      warm_start=True,    # keep progress between .fit(...) calls
                      max_iter=1,         # make only 1 iteration on each .fit(...)
                      learning_rate_init=0.3
                      )
#initialize the agent to the dimension of the state and the number of actions
agent.fit([env.reset()]*n_actions, list(range(n_actions)));
# I suspect the above line is where I must be making the mistake.
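# A minimal sanity check on the dummy fit above, just to make the dimensions
# visible (my assumption: with the old gym API, env.reset() returns a length-2
# state array, so X should have shape (n_actions, 2) and y shape (n_actions,)).
# X0 and y0 are throwaway names used only for this check.
X0 = np.array([env.reset()] * n_actions)
y0 = np.array(range(n_actions))
print("dummy fit -> X:", X0.shape, " y:", y0.shape, " classes:", agent.classes_)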
# Printing out relevant information about the environment
print("Understanding the environment")
print("Number of actions: ", env.action_space)
print("allowed actions: ", [env.action_space.sample() for _ in range(10)])
print("0 for push left, 1 for no push and 2 for push right\n")
print("Observation space:", env.observation_space)
print("Observation space:\n car position (x), car velocity (x')")
print("env.observation_space high: ", env.observation_space.high)
print("env.observation_space low: ", env.observation_space.low, "\n")
print("\nSetting the observation space maximums/minimums")
print("Ex:\nenv.observation_space.high[1] = 0.1 (velocity)")
# generate one session; only the last `prior` (state, action) pairs of each session are returned (a quick shape check follows the function below)
from threading import current_thread
def generate_session(t_max=10**4):
    print("running process", os.getpid())
    print("current running thread: ", current_thread().name)
    epsilon = 0.0
    r = 0
    s = env.reset()
    states, actions = [], []
    total_reward = 0.
    prior = 5
    for t in range(t_max):
        probs = agent.predict_proba([s])  # another line where I suspect I am making a mistake
        if random.uniform(0, 1) < epsilon:
            a = env.action_space.sample()
        else:
            #a = policy(s[1])
            a = np.random.choice(n_actions, p=np.squeeze(probs))
        new_s, r, done, info = env.step(a)
        s = new_s
        # record the session
        states.append(s)
        actions.append(a)
        total_reward += r
        if done: break
    return states[-prior:], actions[-prior:], total_reward
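# Quick check of what one session returns (my expectation, not verified: `states`
# stacks to shape (prior, 2) and `actions` is a flat list of ints of the same
# length; the old 4-value gym step API is assumed). Throwaway names only.
_chk_states, _chk_actions, _chk_reward = generate_session(t_max=200)
print("one session ->", np.array(_chk_states).shape, len(_chk_actions), _chk_reward)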
# Select only the states and their corresponding actions from sessions whose total reward is above the given percentile.
def select_elites(states_batch, actions_batch, rewards_batch, percentile=50):
    reward_threshold = np.percentile(rewards_batch, percentile)
    ind = [i for i, reward in enumerate(rewards_batch) if reward >= reward_threshold]
    elite_states = [j for i in ind for j in states_batch[i]]
    elite_actions = [j for i in ind for j in actions_batch[i]]
    return elite_states, elite_actions
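# Tiny illustration of select_elites with made-up numbers (not part of the
# training run): with percentile=50, only sessions whose total reward is at
# least the median (-100 here) contribute their states and actions.
_demo_states = [[[0.1, 0.0]], [[0.2, 0.0]], [[0.3, 0.0]]]
_demo_actions = [[0], [1], [2]]
_demo_rewards = [-200, -100, -50]
print(select_elites(_demo_states, _demo_actions, _demo_rewards, percentile=50))
# expected output: ([[0.2, 0.0], [0.3, 0.0]], [1, 2])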
from IPython.display import clear_output
def show_progress(batch_rewards, log, percentile, reward_range):  # e.g. reward_range=[-990, +10]
    """
    A convenience function that displays training progress.
    No cool math here, just charts.
    """
    mean_reward, threshold = np.mean(batch_rewards), np.percentile(batch_rewards, percentile)
    log.append([mean_reward, threshold])
    clear_output(True)
    print("mean reward = %.3f, threshold=%.3f" % (mean_reward, threshold))
    plt.figure(figsize=[8, 4])
    plt.subplot(1, 2, 1)
    plt.plot(list(zip(*log))[0], label='Mean rewards')
    plt.plot(list(zip(*log))[1], label='Reward thresholds')
    plt.legend()
    plt.grid()
    plt.subplot(1, 2, 2)
    plt.hist(batch_rewards, range=reward_range)
    plt.vlines([np.percentile(batch_rewards, percentile)], [0], [100], label="percentile", color='red')
    plt.legend()
    plt.grid()
    plt.show()
#Training loop
# Generate sessions, select N best and fit to those.
# The following three functions give a choice between running with no multiprocessing, with the multiprocessing module, or with concurrent.futures (just change OPTION in main() before running). I am still working on the parallel versions.
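# Hypothetical helper (my own sketch, not used by the functions below): this is
# the shape of the parallel rollout I am aiming for. It assumes a fork-based
# start method so each worker inherits a read-only copy of `agent`, and that
# only the parent process ever calls agent.fit().
def run_sessions_in_parallel(n_sessions=8, t_max=10**3, workers=4):
    import concurrent.futures
    with concurrent.futures.ProcessPoolExecutor(max_workers=workers) as executor:
        return list(executor.map(generate_session, [t_max] * n_sessions))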
# Running the session and generating the graphs
# No multithreading
def create_graphs_no_mp(n_sessions=100, percentile=50, epochs=100, t_max=10**4):
    log = []
    print("No multiprocessing is used")
    for i in range(epochs):
        tt = time.time()
        sessions = [generate_session(t_max=t_max) for _ in range(n_sessions)]
        batch_states, batch_actions, batch_rewards = map(np.array, zip(*sessions))
        elite_states, elite_actions = select_elites(batch_states, batch_actions, batch_rewards, percentile)
        agent.fit(elite_states, elite_actions)
        show_progress(batch_rewards, log, percentile, reward_range=[np.min(batch_rewards), np.max(batch_rewards)])
        print("epoch ", i)
        print("time taken for epoch {} = {}".format(i, (time.time() - tt)))
        if np.mean(batch_rewards) > -151:
            print("You Win! You may stop training now via KeyboardInterrupt.")
# Running the session and generating the graphs
# using multiprocessing module
def create_graphs_mp(n_sessions=100, percentile=50, epochs=100, t_max=10**4):
    log = []
    import multiprocessing as mp
    print("multiprocessing module is used")
    for i in range(epochs):
        tt = time.time()
        pool = mp.Pool(processes=5)
        sessions = [pool.apply(generate_session, (t_max,)) for _ in range(n_sessions)]
        batch_states, batch_actions, batch_rewards = pool.map(np.array, zip(*sessions))
        elite_states, elite_actions = pool.apply(select_elites, (batch_states, batch_actions, batch_rewards, percentile))
        pool.close()
        agent.fit(elite_states, elite_actions)
        show_progress(batch_rewards, log, percentile, reward_range=[np.min(batch_rewards), np.max(batch_rewards)])
        print("epoch ", i)
        print("time taken for epoch {} = {}".format(i, (time.time() - tt)))
        if np.mean(batch_rewards) > -151:
            print("You Win! You may stop training now via KeyboardInterrupt.")
# Running the session and generating the graphs
# using concurrent.futures
def create_graphs_cf(n_sessions=100, percentile=50, epochs=100, t_max=10**4):
    import itertools
    log = []
    sessions = []
    param = [t_max] * n_sessions
    #param = t_max
    import concurrent.futures
    print("concurrent futures module is used")
    for i in range(epochs):
        tt = time.time()
        with concurrent.futures.ProcessPoolExecutor(max_workers=6) as executor:
            #with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
            sessions = list(executor.map(generate_session, param))
        #sessions_result = [sessions[i].result() for i in range(len(sessions))]
        batch_states, batch_actions, batch_rewards = map(np.array, zip(*sessions))
        elite_states, elite_actions = select_elites(batch_states, batch_actions, batch_rewards, percentile)
        agent.fit(elite_states, elite_actions)
        show_progress(batch_rewards, log, percentile, reward_range=[np.min(batch_rewards), np.max(batch_rewards)])
        print("epoch ", i)
        print("time taken for epoch {} = {}".format(i, (time.time() - tt)))
        if np.mean(batch_rewards) > -151:
            print("You Win! You may stop training now via KeyboardInterrupt.")
if __name__ == "__main__":
    def main():
        OPTION = 'no_mp'  # change to any of the following:
        # OPTION = 'cf': multiprocessing via the concurrent.futures module
        # OPTION = 'mp': multiprocessing module used
        # OPTION = 'no_mp': no multiprocessing used
        n_sessions = 10   # this is usually 200
        percentile = 60
        epochs = 5        # this is usually 100
        t_max = 1000      # this is usually 10**4
        if OPTION == 'no_mp':
            create_graphs_no_mp(n_sessions, percentile, epochs, t_max)
        elif OPTION == 'mp':
            create_graphs_mp(n_sessions, percentile, epochs, t_max)
        else:
            create_graphs_cf(n_sessions, percentile, epochs, t_max)
    main()