Keras model prediction result invalid - AssertionError

Time: 2018-08-13 04:39:43

Tags: python machine-learning keras reinforcement-learning openai-gym

The aim of the model is to predict a random number using reinforcement learning. The problem seems to be in the act function; the error I get is an AssertionError on assert self.action_space.contains(action), which occurs once act returns Q. I am lost as to why it outputs -0.32666808 instead of a valid result like 108.2323. Any help would be greatly appreciated.

Edit: For clarity, I am trying to port the code from here, which was originally written to solve the CartPole-v0 gym environment. I am trying to modify it to solve the HotterColder gym environment.
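For reference, here is a minimal sketch (not part of the original code) of how a gym Box action space validates an action, assuming the same [-200, 200] bounds used by the environment below. Depending on the gym version, contains() checks the array shape as well as the value range, so a raw float or a 2-D array straight from model.predict can fail the assertion even when the number itself is inside the bounds.

import numpy as np
from gym import spaces

# Same bounds as the HotterColder environment below (assumed for illustration)
action_space = spaces.Box(low=np.array([-200.0]), high=np.array([200.0]))

sample = action_space.sample()                # shape (1,), e.g. array([108.23], dtype=float32)
print(action_space.contains(sample))          # True: correct shape and within bounds

raw_prediction = np.array([[-0.32666808]])    # shape (1, 1), like the output of model.predict
print(action_space.contains(raw_prediction))  # typically False: shape does not match the Box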

import numpy as np
import gym
import random
from gym import spaces
from gym.utils import seeding
from collections import deque
from keras.layers import Input, Activation, Dense, Flatten, RepeatVector, Reshape
from keras.layers.convolutional import Conv2D
from keras.models import Model,Sequential
from keras import backend as K
from keras import optimizers
class HotterColder(gym.Env):
    """Hotter Colder
    The goal of hotter colder is to guess closer to a randomly selected number
    After each step the agent receives an observation of:
    0 - No guess yet submitted (only after reset)
    1 - Guess is lower than the target
    2 - Guess is equal to the target
    3 - Guess is higher than the target
    The reward is calculated as:
    ((min(action, self.number) + self.bounds) / (max(action, self.number) + self.bounds)) ** 2
    Ideally an agent will be able to recognise the 'scent' of a higher reward and
    increase the rate at which it guesses in that direction until the reward reaches
    its maximum
    """
    def __init__(self):
        self.range = 100 # +/- value the randomly selected number can fall between
        self.bounds = 200  # Action space bounds

        self.action_space = spaces.Box(low=np.array([-self.bounds]), high=np.array([self.bounds]))
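        # The Box above has shape (1,): a valid action is a single value in [-self.bounds, self.bounds]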
        #self.action_space =  spaces.Discrete(self.bounds)
        self.observation_space = spaces.Discrete(3)
        #self.observation_space = spaces.Box(low=np.array([-self.bounds]), high=np.array([self.bounds]))
        #self.action_space = spaces.Discrete(4)#spaces.Box(low=np.array([-self.bounds]), high=np.array([self.bounds]))
        self.number = 0
        self.guess_count = 0
        self.guess_max = 2000
        self.observation = 0

        self.seed()
        self.reset()

    def seed(self, seed=None):
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def step(self, action):
        assert self.action_space.contains(action)
        if action < self.number:
            self.observation = 1
            #print("num too low")

        elif action == self.number:
            self.observation = 2
            print("Action=" + str(action) + " Number Goal=" + str(self.number))
            print("Mission complete " + str(self.guess_count))
            self.done = True
            self.reset()
            #self.guess_count = 0
            #time.sleep(2)

        elif action > self.number:
            self.observation = 3
            #print('num too high')
        print("Action=" + str(action) + " Number Goal=" + str(self.number) + " Observation=" + str(self.observation))
        reward = ((min(action, self.number) + self.bounds) / (max(action, self.number) + self.bounds)) ** 2

        self.guess_count += 1
        done = self.guess_count >= self.guess_max
        #print(reward)
        return self.observation, reward, done, {"number": self.number, "guesses": self.guess_count}

    def reset(self):
        #self.number = self.np_random.uniform(-self.range,self.range)
        self.number = self.np_random.uniform(-self.range,self.range, size=(1,))
        self.guess_count = 0
        self.observation = 0
        return self.observation
class Agent:
    def __init__(self, env):
        self.env = env
        self.input_dim = 1
        self.output_dim = env.action_space.shape[0]
        print(self.output_dim)
        self.create_model()

    def create_model(self):
        self.model = Sequential()
        self.model.add(Dense(3,input_shape=(1,)))
        self.model.add(Dense(8))
        self.model.add(Activation('relu'))
        self.model.add(Dense(32))
        self.model.add(Activation('relu'))
        self.model.add(Dense(16))
        self.model.add(Activation('relu'))
        self.model.add(Dense(self.output_dim))
        #model.add(Activation('softmax'))
        #sgd = optimizers.SGD(lr=0.01, decay=1e-6, momentum=0.9, nesterov=True)
        #self.model = Model(inputs=X, outputs=net)
        self.model.compile(loss='mean_squared_error', optimizer='rmsprop', metrics=['accuracy'])
        #print(self.model.summary())
        #self.model = model
        #return model

    def act(self, X, eps=1.0):
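        # Epsilon-greedy: with probability eps return a random action sampled from the action space,
        # otherwise return the model's prediction for the current observation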
        if np.random.rand() < eps:
            return self.env.action_space.sample()
        print(self.env.action_space.sample())
        #X = float(X)
        print(X)
        X = np.asarray(X,)
        #print(X)
        X = X.reshape(-1, )
        print(X)
        #print(X.shape)
        #print(self.env.action_space.sample())
        #print(X)
        #X = np.array(X)
        Q = self.model.predict(X)
        print(Q)
        print(np.argmax(Q, 1)[0])
        #return np.argmax(Q, 1)[0]
        return Q

    def train(self, X_batch, y_batch):
        return self.model.train_on_batch(X_batch, y_batch)

    def predict(self, X_batch):
        #print(self.model.summary())
        #print(X_batch)
        #time()
        return self.model.predict_on_batch(X_batch)


def create_batch(agent, memory, batch_size, discount_rate):
    sample = random.sample(memory, batch_size)
    sample = np.asarray(sample)

    s = sample[:, 0]
    a = sample[:, 1].astype(np.int8)
    r = sample[:, 2]
    s2 = sample[:, 3]
    d = sample[:, 4] * 1.
    #print(a)
    #time.sleep()
    #print(s)
    X_batch = np.vstack(s)
    #print(X_batch)
    y_batch = agent.predict(X_batch)
    #print(np.max(agent.predict(np.vstack(s2))))
    #print(np.arange(batch_size))
    #time.sleep(5)
    #y_batch[np.arange(batch_size), a] = r + discount_rate * np.max(agent.predict(np.vstack(s2)), 1) * (1 - d)
    #print(y_batch)
    #print(r + discount_rate * np.max(agent.predict(np.vstack(s2)), 1) * (1 - d))
    #y_batch[np.arange(batch_size)] = r + discount_rate * np.max(agent.predict(np.vstack(s2)), 1) * (1 - d)

    return X_batch, y_batch


def print_info(episode, reward, eps):
    #print("[Episode {episode:>5}] Reward: {reward:>5} EPS: {eps:>3.2f}")
    print("Episode " + str(episode) + " Reward " + str(reward) + " EPS " + str(eps))


def main():
    n_episode = 1000
    discount_rate = 0.99
    n_memory = 5000
    batch_size = 32
    eps = 1.0
    min_eps = 0.1
    #env_name = 'CartPole-v0'
    #env = gym.make(env_name)
    env = HotterColder()
    agent = Agent(env)
    memory = deque()

    # CartPole-v0 Clear Condition
    # Average reward per episode > 195.0 over 100 episodes
    LAST_100_GAME_EPISODE_REWARDS = deque()

    for episode in range(n_episode):
        done = False
        s = env.reset()
        print("s: " + str(s))
        eps = max(min_eps, eps - 1/(n_episode/2))
        episode_reward = 0
        while not done:
            a = agent.act(s, eps)
            s2, r, done, info = env.step(a)
            episode_reward += r

            if done and episode_reward < 200:
                r = -100

            memory.append([s, a, r, s2, done])

            if len(memory) > n_memory:
                memory.popleft()

            if len(memory) > batch_size:
                X_batch, y_batch = create_batch(agent, memory, batch_size, discount_rate)
                agent.train(X_batch, y_batch)

            s = s2

        print_info(episode, episode_reward, eps)
        LAST_100_GAME_EPISODE_REWARDS.append(episode_reward)
        if len(LAST_100_GAME_EPISODE_REWARDS) > 100:
            LAST_100_GAME_EPISODE_REWARDS.popleft()

        if np.mean(LAST_100_GAME_EPISODE_REWARDS) >= 195.0:
            #print(f"Game solved in {episode + 1} with average reward {np.mean(LAST_100_GAME_EPISODE_REWARDS)}")
            print("Game solved in " + str(episode + 1) + " with average reward  " + str(np.mean(LAST_100_GAME_EPISODE_REWARDS)) )

    env.close()


if __name__ == '__main__':
    main()

0 Answers:

No answers yet