Policy evaluation in reinforcement learning

Time: 2021-02-23 09:20:56

Tags: python arrays function robot

I have a robot class, CleaningRobotEnv(), and I'm trying to write code for policy evaluation (policy_eval()). However, inside my policy_eval() function I'm struggling to call some things from the CleaningRobotEnv() class, mainly the variables prob, next_state, rewards and terminal. Could someone show me the right way to call these? (A short usage sketch of dynamics() is included right after the class.) The robot class:

import numpy as np

# Note: the State and Action enums are defined in separate files that are omitted here.


class CleaningRobotEnv(object):
    """
    @info: Initialise CleaningRobotEnv object with probabilities for different transitions.
    @param: Alpha, beta, gamma and delta (transition probabilities).
    """

    def __init__(self, alpha=0.4, beta=0.1, gamma=0.1, delta=0.9):
        # State and action definitions
        self.nS = len(State)
        self.nA = len(Action)

        # Transition dynamics p(s'|s,a)
        # key: state and action
        # value: possible transitions to other states with associated probabilities
        self.transitions = {
            # Search action
            (State.HIGH, Action.SEARCH): {State.HIGH: 1 - alpha,
                                          State.MEDIUM: (2 * alpha) / 3,
                                          State.LOW: alpha / 3},
            (State.MEDIUM, Action.SEARCH): {State.MEDIUM: 1 - beta,
                                            State.LOW: beta},
            (State.LOW, Action.SEARCH): {State.LOW: 1 - gamma,
                                         State.FLAT: gamma},

            # Wait action
            (State.HIGH, Action.WAIT): {State.HIGH: 1},
            (State.MEDIUM, Action.WAIT): {State.MEDIUM: 1},
            (State.LOW, Action.WAIT): {State.LOW: 1},

            # Recharge action
            (State.HIGH, Action.RECHARGE): {State.HIGH: 1},
            (State.LOW, Action.RECHARGE): {State.LOW: 1 - delta,
                                           State.MEDIUM: delta},
            (State.MEDIUM, Action.RECHARGE): {State.MEDIUM: 1 - delta,
                                              State.HIGH: delta},

            # Include FLAT state, no transitions away
            (State.FLAT, Action.SEARCH): {State.FLAT: 1},
            (State.FLAT, Action.WAIT): {State.FLAT: 1},
            (State.FLAT, Action.RECHARGE): {State.FLAT: 1}
        }

        # Reward definition: a dictionary keyed by action (plus the FLAT state) with the reward as value
        self.rewards = {
            Action.SEARCH: 10,
            Action.WAIT: 5,
            Action.RECHARGE: 0,
            State.FLAT: -10
        }

        # Initialise starting state to high
        self.state = State.HIGH

    def compute_reward(self, state, action):
        """
        @info: Calculates the reward for the given state and action.
        @param: state: the (next) battery state.
                action: the action taken.
        @return: reward value.
        """
        if state == State.FLAT:
            return self.rewards[state]
        else:
            return self.rewards[action]

    def is_terminal(self, state):
        """
        @info: Checks if the state is a terminal state.
        @param: state: the battery state to check.
        @return: True (this is a terminal state) or False (this is not a terminal state).
        """
        return state == State.FLAT

    def reset(self, random_state=False):
        """
        @info: Resets the environment to a starting state.
        @param: random_state: if True, start in a random state, otherwise start with a high battery.
        @return: the starting state.
        """
        if random_state:
            state = np.random.choice(self.nS)
        else:
            # Start off with high battery
            state = State.HIGH
        self.state = state
        return state

    def step(self, action):
        """
        @info: Step function performs one step in the MDP.
        @param: action.
        @return: next_state, reward, terminal
        """
        # With the dynamics function compute all possible state_prob, new_state, reward, terminal
        state_probs, next_states, rewards, terminals = self.dynamics(self.state, action)
        # Sample a new_state, reward and terminal
        i = np.random.choice(np.size(state_probs), p=state_probs)
        self.state = next_states[i]
        return next_states[i], rewards[i], terminals[i]

    def dynamics(self, state, action):
        """
        @info: Compute all possible transitions from the given state and action.
        @param: state: current state.
                action: action taken.
        @return: state_probs, next_states, rewards, terminals (parallel lists).
        """
        # Get transition probabilities at current state
        transition_probs = self.transitions[(state, action)]

        # Iterate through possible next states and store the associated
        # probabilities, rewards and terminal flags
        next_states = []
        state_probs = []
        rewards = []
        terminals = []
        for (s, s_prob) in transition_probs.items():
            next_states.append(s)
            state_probs.append(s_prob)
            rewards.append(self.compute_reward(s, action))
            terminals.append(self.is_terminal(s))

        return state_probs, next_states, rewards, terminals

    def render(self):
        print("State: ", self.state)

The policy evaluation function:

robot_env = CleaningRobotEnv()
transition_probs = robot_env.transitions[(state, action)]

def policy_eval(policy, robot_env, discount_factor=1.0, theta=0.00001):
    """
    Evaluate a policy given an environment and a full description of the environment's dynamics.

    Args:
        policy: [S, A] shaped matrix representing the policy.
        env: OpenAI env. env.P represents the transition probabilities of the environment.
            env.nS is the number of states in the environment.
            env.nA is the number of actions in the environment.
        theta: We stop evaluation once our value function change is less than theta for all states.
        discount_factor: Gamma discount factor.

    Returns:
        Vector of length env.nS representing the value function.
    """
    # Start with a random (all 0) value function
    V = np.zeros(robot_env.nS)
    while True:
        delta = 0
        # For each state, perform a "full backup"
        for s in range(robot_env.nS):
            v = 0
            # Look at the possible next actions
            for a, action_prob in enumerate(policy[s]):
                # For each action, look at the possible next states...
                next_state, rewards, done = robot_env.step(a)  # <---------- PROBLEM STARTS
                for s, s_prob in robot_env.transitions[(state, action)].items():
                    prob = s_prob
                    v += action_prob * prob * (reward + discount_factor * V[next_state])
            # How much our value function changed (across any states)
            delta = max(delta, np.abs(v - V[s]))
            V[s] = v
        # Stop evaluating once our value function change is below a threshold
        if delta < theta:
            break
    return np.array(V)

What I'm using to test it:

policy = np.zeros([robot_env.nS, robot_env.nA])
# Policy when state is high
policy[State.HIGH, Action.WAIT] = 0.2
policy[State.HIGH, Action.SEARCH] = 0.8
policy[State.HIGH, Action.RECHARGE] = 0

policy[State.MEDIUM, Action.WAIT] = 0.2
policy[State.MEDIUM, Action.SEARCH] = 0.8
policy[State.MEDIUM, Action.RECHARGE] = 0
# Policy when state is low
policy[State.LOW, Action.WAIT] = 0.8
policy[State.LOW, Action.SEARCH] = 0.1
policy[State.LOW, Action.RECHARGE] = 0.1

v = policy_eval(policy, robot_env)
print(v)

Let me know if you want all of the related files, but this is the part of my function that I can't fix. Thanks a lot!

UPDATE: I've made some changes to fix the policy_eval function, shown below. The only remaining problem is that delta increases when it should be decreasing. Any ideas?

robot_env = CleaningRobotEnv()

def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):
    # Start with a random (all 0) value function
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            v = 0
            for a, action_prob in enumerate(policy[s]):
                (probs, next_states, rewards, dones) = env.dynamics(s, a)
                for next_state in range(np.size(next_states)):
                    for prob in range(np.size(probs)):
                        for reward in range(np.size(rewards)):
                            for done in range(np.size(dones)):
                                v += action_prob * prob * (reward + discount_factor * V[next_state])
            delta = max(delta, np.abs(v - V[s]))
            V[s] = v
        if delta < theta:
            break
    return np.array(V)
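
The likely reason delta keeps growing is that the four inner "for ... in range(np.size(...))" loops run over index ranges, so prob and reward take the integer values 0, 1, 2, ... rather than the actual probabilities and rewards, and the nesting multiplies the number of terms added to v. A sketch of how the inner part of the loop could walk the parallel lists from dynamics() together instead, with the rest of the function unchanged (this assumes State is an IntEnum so that next_state can index V):

for a, action_prob in enumerate(policy[s]):
    # Full transition distribution for (s, a): parallel lists of probabilities,
    # next states, rewards and terminal flags.
    probs, next_states, rewards, dones = env.dynamics(s, a)
    for prob, next_state, reward in zip(probs, next_states, rewards):
        # Expected Bellman backup: pi(a|s) * p(s'|s,a) * (r + gamma * V[s'])
        v += action_prob * prob * (reward + discount_factor * V[next_state])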
