I have a robot class, CleaningRobotEnv(), and I'm trying to write code for policy evaluation (policy_eval()). However, inside my policy_eval() function I'm struggling to call things from the CleaningRobotEnv() class, mainly the variables prob, next_state, rewards and terminal. Could someone show me the right way to call these? The robot class is below (right after it I've added a rough sketch of the State/Action enums and of what dynamics() returns, in case that helps):
class CleaningRobotEnv(object):
"""
@info: Initialise CleaningRobotEnv object with probabilities for different transitions.
@param: Alpha, beta, gamma and delta (transition probabilities)
"""
def __init__(self, alpha=0.4, beta=0.1, gamma=0.1, delta=0.9):
# State and action definitions
self.nS = len(State)
self.nA = len(Action)
# Transition dynamics p(s'|s,a)
# key: State and action
# value: Possible transitions to other states with associated probabilities
self.transitions = {
# Search action
(State.HIGH, Action.SEARCH): {State.HIGH: 1 - alpha,
State.MEDIUM: (2*alpha)/3,
State.LOW: alpha/3},
(State.MEDIUM, Action.SEARCH): {State.MEDIUM: 1 - beta,
State.LOW: beta},
(State.LOW, Action.SEARCH): {State.LOW: 1 - gamma,
State.FLAT: gamma},
# Wait action
(State.HIGH, Action.WAIT): {State.HIGH: 1},
(State.MEDIUM, Action.WAIT): {State.MEDIUM: 1},
(State.LOW, Action.WAIT): {State.LOW: 1},
# Recharge action
(State.HIGH, Action.RECHARGE): {State.HIGH: 1},
(State.LOW, Action.RECHARGE): {State.LOW: 1-delta,
State.MEDIUM: delta},
(State.MEDIUM, Action.RECHARGE): {State.MEDIUM: 1-delta,
State.HIGH: delta},
# Include FLAT State, no transitions away
(State.FLAT, Action.SEARCH): {State.FLAT: 1},
(State.FLAT, Action.WAIT): {State.FLAT: 1},
(State.FLAT, Action.RECHARGE): {State.FLAT: 1}
}
# Reward definition is dictionary with key action and value reward
self.rewards = {
Action.SEARCH: 10,
Action.WAIT: 5,
Action.RECHARGE: 0,
State.FLAT: -10
}
# Initialise starting state to high
self.state = State.HIGH
"""
@info: compute_reward returns the reward for the given state and action.
@param: state: current battery state (State member).
        action: action taken (Action member).
@return: reward value.
"""
def compute_reward(self, state, action):
if state == State.FLAT:
return self.rewards[state]
else:
return self.rewards[action]
"""
@info: Checks if the state is a terminal state.
@param: state: state to check (State member).
@return: True (this is a terminal state) or False (this is not a terminal state).
"""
def is_terminal(self, state):
return state == State.FLAT
"""
@info: Reset the environment to the starting state.
@param: random_state: if True, start in a random state; otherwise start with a high battery.
@return: starting state
"""
def reset(self, random_state=False):
if random_state:
state = np.random.choice(self.nS)
else:
# Start off with high battery
state = State.HIGH
self.state = state
return state
"""
@info: Step function performs one step in MDP.
@param: Action.
@return: next_state, reward, terminal
"""
def step(self, action):
# With dynamics function compute all possible state_prob, new_state, reward, terminal
state_probs, next_states, rewards, terminals = self.dynamics(self.state, action)
# Sample a new_state, reward and terminal
i = np.random.choice(np.size(state_probs), p=state_probs)
self.state = next_states[i]
return next_states[i], rewards[i], terminals[i]
"""
@info: Compute transition from current state and action.
@param: Current state.
Action.
@return: state_probs, next_states, rewards, terminals (parallel lists over the possible next states)
"""
def dynamics(self, state, action):
# Get transition probabilities at current state
transition_probs = self.transitions[(state, action)]
# Iterate and store through possible states and associated probabilities
next_states = []
state_probs = []
rewards = []
terminals = []
for (s, s_prob) in transition_probs.items():
next_states.append(s)
state_probs.append(s_prob)
rewards.append(self.compute_reward(s, action))
terminals.append(self.is_terminal(s))
return state_probs, next_states, rewards, terminals
def render(self):
print("State: ", self.state)
The policy evaluation function:
robot_env = CleaningRobotEnv()
transition_probs = robot_env.transitions[(state, action)]
def policy_eval(policy, robot_env, discount_factor=1.0, theta=0.00001):
"""
Evaluate a policy given an environment and a full description of the environment's dynamics.
Args:
policy: [S, A] shaped matrix representing the policy.
robot_env: CleaningRobotEnv instance. robot_env.transitions holds the transition
probabilities of the environment, robot_env.nS is the number of states and
robot_env.nA is the number of actions.
theta: We stop evaluation once our value function change is less than theta for all states.
discount_factor: Gamma discount factor.
Returns:
Vector of length env.nS representing the value function.
"""
# Start with an all-zeros value function
V = np.zeros(robot_env.nS)
while True:
delta = 0
# For each state, perform a "full backup"
for s in range(robot_env.nS):
v = 0
# Look at the possible next actions
for a, action_prob in enumerate(policy[s]):
# For each action, look at the possible next states...
next_state, rewards, done = robot_env.step(a) #<----------PROBLEM STARTS
for s, s_prob in robot_env.transitions[(state, action)].items():
prob = s_prob
v += action_prob * prob * (reward + discount_factor * V[next_state])
# How much our value function changed (across any states)
delta = max(delta, np.abs(v - V[s]))
V[s] = v
# Stop evaluating once our value function change is below a threshold
if delta < theta:
break
return np.array(V)
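For reference, the full backup I'm trying to compute for each state s is the standard iterative policy evaluation update:

    V(s) <- sum over a of policy(a|s) * sum over s' of p(s'|s,a) * (reward + discount_factor * V(s'))

so I think prob should be the transition probability p(s'|s,a) and reward the matching entry of the rewards list from dynamics(), rather than a single sample drawn by step().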
What I'm using to test it:
policy = np.zeros([robot_env.nS, robot_env.nA])
# Policy when state is high
policy[State.HIGH, Action.WAIT] = 0.2
policy[State.HIGH, Action.SEARCH] = 0.8
policy[State.HIGH, Action.RECHARGE] = 0
policy[State.MEDIUM, Action.WAIT] = 0.2
policy[State.MEDIUM, Action.SEARCH] = 0.8
policy[State.MEDIUM, Action.RECHARGE] = 0
# Policy when state is low
policy[State.LOW, Action.WAIT] = 0.8
policy[State.LOW, Action.SEARCH] = 0.1
policy[State.LOW, Action.RECHARGE] = 0.1
v = policy_eval(policy, robot_env)
print(v)
Let me know if you'd like all the related files, but this is the problem I can't get past in my function. Thanks a lot!
UPDATE: I've made some changes to fix the policy_eval function, shown below. The only remaining problem is that delta increases when it should be decreasing. Any idea why?
robot_env = CleaningRobotEnv()
def policy_eval(policy, env, discount_factor=1.0, theta=0.00001):
# Start with an all-zeros value function
V = np.zeros(env.nS)
while True:
delta = 0
for s in range(env.nS):
v = 0
for a, action_prob in enumerate(policy[s]):
(probs,next_states, rewards, dones) = env.dynamics(s,a)
for next_state in range(np.size(next_states)):
for prob in range(np.size(probs)):
for reward in range(np.size(rewards)):
for done in range(np.size(dones)):
v += action_prob *prob * (reward + discount_factor * V[next_state])
delta = max(delta, np.abs(v - V[s]))
V[s] = v
if delta < theta:
break
return np.array(V)
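For comparison, here is a minimal sketch of the inner loop structure I'm aiming for (assuming State and Action are IntEnum as sketched above, so the plain ints s and a can index the transitions dict). It zips the parallel lists returned by dynamics() instead of looping over range() indices, since in my version above prob and reward end up being loop counters rather than the actual probability and reward values. Not verified, just to show the intended shape:

# Sketch only: iterate the actual values from dynamics() with zip(), so that
# prob, next_state and reward are real values rather than range() indices.
def policy_eval_sketch(policy, env, discount_factor=1.0, theta=0.00001):
    V = np.zeros(env.nS)
    while True:
        delta = 0
        for s in range(env.nS):
            v = 0
            for a, action_prob in enumerate(policy[s]):
                probs, next_states, rewards, dones = env.dynamics(s, a)
                # weight each possible outcome by its transition probability
                for prob, next_state, reward in zip(probs, next_states, rewards):
                    v += action_prob * prob * (reward + discount_factor * V[next_state])
            delta = max(delta, np.abs(v - V[s]))
            V[s] = v
        if delta < theta:
            break
    return V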