The goal of the model is to use reinforcement learning to guess a random number. The problem seems to be in the act function: the error I get is an AssertionError on assert self.action_space.contains(action), and it happens as soon as act returns Q. I am lost as to why its output is something like -0.32666808 rather than a valid result like 108.2323. Any help would be appreciated.
Edit: For clarity, I am trying to port the code from here, which was originally written to solve the CartPole-v0 gym environment. I am trying to modify that code to solve the HotterColder gym environment instead.
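For reference, here is a minimal, separate check of what I understand contains() to require (this is only my assumption about the assert, not part of the training code): a Box built from shape-(1,) low/high arrays seems to need a shape-(1,) action, so the 2-D array that model.predict returns would fail the check even when the value itself is inside the bounds.

import numpy as np
from gym import spaces

box = spaces.Box(low=np.array([-200.0]), high=np.array([200.0]))

good = np.array([-0.32666808], dtype=np.float32)  # shape (1,), like box.sample()
bad = good.reshape(1, 1)                          # shape (1, 1), like the output of model.predict

print(box.contains(good))  # True on my gym version: in bounds and the shape matches
print(box.contains(bad))   # False: shape (1, 1) does not match the Box shape (1,)

The full code I am running is below.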
import numpy as np
import gym
import random
from gym import spaces
from gym.utils import seeding
from collections import deque
from keras.layers import Input, Activation, Dense, Flatten, RepeatVector, Reshape
from keras.layers.convolutional import Conv2D
from keras.models import Model,Sequential
from keras import backend as K
from keras import optimizers
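# Custom gym environment: the target number is drawn uniformly from [-100, 100] and
# the reward approaches 1 as the guess approaches the target.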
class HotterColder(gym.Env):
    """Hotter Colder
    The goal of hotter colder is to guess closer to a randomly selected number
    After each step the agent receives an observation of:
    0 - No guess yet submitted (only after reset)
    1 - Guess is lower than the target
    2 - Guess is equal to the target
    3 - Guess is higher than the target
    The reward is calculated as:
    (min(action, self.number) + self.range) / (max(action, self.number) + self.range)
    Ideally an agent will be able to recognise the 'scent' of a higher reward and
    increase the rate at which it guesses in that direction until the reward reaches
    its maximum
    """
    def __init__(self):
        self.range = 100  # +/- value the randomly selected number can be between
        self.bounds = 200  # Action space bounds
        self.action_space = spaces.Box(low=np.array([-self.bounds]), high=np.array([self.bounds]))
        #self.action_space = spaces.Discrete(self.bounds)
        self.observation_space = spaces.Discrete(3)
        #self.observation_space = spaces.Box(low=np.array([-self.bounds]), high=np.array([self.bounds]))
        #self.action_space = spaces.Discrete(4)#spaces.Box(low=np.array([-self.bounds]), high=np.array([self.bounds]))
        self.number = 0
        self.guess_count = 0
        self.guess_max = 2000
        self.observation = 0
        self.seed()
        self.reset()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]
    def step(self, action):
        assert self.action_space.contains(action)
        if action < self.number:
            self.observation = 1
            #print("num too low")
        elif action == self.number:
            self.observation = 2
            print("Action=" + str(action) + " Number Goal=" + str(self.number))
            print("Mission complete " + str(self.guess_count))
            self.done = True
            self.reset()
            #self.guess_count = 0
            #time.sleep(2)
        elif action > self.number:
            self.observation = 3
            #print('num too high')
        print("Action=" + str(action) + " Number Goal=" + str(self.number) + " Observation=" + str(self.observation))
        reward = ((min(action, self.number) + self.bounds) / (max(action, self.number) + self.bounds)) ** 2
        self.guess_count += 1
        done = self.guess_count >= self.guess_max
        #print(reward)
        return self.observation, reward, done, {"number": self.number, "guesses": self.guess_count}
    def reset(self):
        #self.number = self.np_random.uniform(-self.range,self.range)
        self.number = self.np_random.uniform(-self.range, self.range, size=(1,))
        self.guess_count = 0
        self.observation = 0
        return self.observation
class Agent:
    def __init__(self, env):
        self.env = env
        self.input_dim = 1
        self.output_dim = env.action_space.shape[0]
        print(self.output_dim)
        self.create_model()

    def create_model(self):
        self.model = Sequential()
        self.model.add(Dense(3, input_shape=(1,)))
        self.model.add(Dense(8))
        self.model.add(Activation('relu'))
        self.model.add(Dense(32))
        self.model.add(Activation('relu'))
        self.model.add(Dense(16))
        self.model.add(Activation('relu'))
        self.model.add(Dense(self.output_dim))
        #model.add(Activation('softmax'))
        #sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
        #self.model = Model(inputs=X, outputs=net)
        self.model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['accuracy'])
        #print(self.model.summary())
        #self.model = model
        #return model

    def act(self, X, eps=1.0):
        if np.random.rand() < eps:
            return self.env.action_space.sample()
        print(self.env.action_space.sample())
        #X = float(X)
        print(X)
        X = np.asarray(X)
        #print(X)
        X = X.reshape(-1, )
        print(X)
        #print(X.shape)
        #print(self.env.action_space.sample())
        #print(X)
        #X = np.array(X)
        Q = self.model.predict(X)
        print(Q)
        print(np.argmax(Q, 1)[0])
        #return np.argmax(Q, 1)[0]
        return Q

    def train(self, X_batch, y_batch):
        return self.model.train_on_batch(X_batch, y_batch)

    def predict(self, X_batch):
        #print(self.model.summary())
        #print(X_batch)
        #time()
        return self.model.predict_on_batch(X_batch)
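# Sample a minibatch of (state, action, reward, next_state, done) tuples from the replay
# memory and build training inputs/targets; the Q-target update lines are commented out,
# so y_batch is currently just the network's own predictions.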
def create_batch(agent, memory, batch_size, discount_rate):
    sample = random.sample(memory, batch_size)
    sample = np.asarray(sample)
    s = sample[:, 0]
    a = sample[:, 1].astype(np.int8)
    r = sample[:, 2]
    s2 = sample[:, 3]
    d = sample[:, 4] * 1.
    #print(a)
    #time.sleep()
    #print(s)
    X_batch = np.vstack(s)
    #print(X_batch)
    y_batch = agent.predict(X_batch)
    #print(np.max(agent.predict(np.vstack(s2))))
    #print(np.arange(batch_size))
    #time.sleep(5)
    #y_batch[np.arange(batch_size), a] = r + discount_rate * np.max(agent.predict(np.vstack(s2)), 1) * (1 - d)
    #print(y_batch)
    #print(r + discount_rate * np.max(agent.predict(np.vstack(s2)), 1) * (1 - d))
    #y_batch[np.arange(batch_size)] = r + discount_rate * np.max(agent.predict(np.vstack(s2)), 1) * (1 - d)
    return X_batch, y_batch
def print_info(episode, reward, eps):
    #print("[Episode {episode:>5}] Reward: {reward:>5} EPS: {eps:>3.2f}")
    print("Episode " + str(episode) + " Reward " + str(reward) + " EPS " + str(eps))
def main():
    n_episode = 1000
    discount_rate = 0.99
    n_memory = 5000
    batch_size = 32
    eps = 1.0
    min_eps = 0.1
    #env_name = 'CartPole-v0'
    #env = gym.make(env_name)
    env = HotterColder()
    agent = Agent(env)
    memory = deque()
    # CartPole-v0 Clear Condition
    # Average reward per episode > 195.0 over 100 episodes
    LAST_100_GAME_EPISODE_REWARDS = deque()
    for episode in range(n_episode):
        done = False
        s = env.reset()
        print("s: " + str(s))
        eps = max(min_eps, eps - 1 / (n_episode / 2))
        episode_reward = 0
        while not done:
            a = agent.act(s, eps)
            s2, r, done, info = env.step(a)
            episode_reward += r
            if done and episode_reward < 200:
                r = -100
            memory.append([s, a, r, s2, done])
            if len(memory) > n_memory:
                memory.popleft()
            if len(memory) > batch_size:
                X_batch, y_batch = create_batch(agent, memory, batch_size, discount_rate)
                agent.train(X_batch, y_batch)
            s = s2
        print_info(episode, episode_reward, eps)
        LAST_100_GAME_EPISODE_REWARDS.append(episode_reward)
        if len(LAST_100_GAME_EPISODE_REWARDS) > 100:
            LAST_100_GAME_EPISODE_REWARDS.popleft()
        if np.mean(LAST_100_GAME_EPISODE_REWARDS) >= 195.0:
            #print(f"Game solved in {episode + 1} with average reward {np.mean(LAST_100_GAME_EPISODE_REWARDS)}")
            print("Game solved in " + str(episode + 1) + " with average reward " + str(np.mean(LAST_100_GAME_EPISODE_REWARDS)))
    env.close()
if __name__ == '__main__':
    main()