我正在阅读一段与 Markov Chain(马尔可夫链)相关的代码,但我不明白这段代码是如何工作的。
# transition_probabilities[s][a][s2] is the probability of moving from state s
# to state s2 when taking action a. A None entry means there is no transition
# distribution for that (state, action) pair; these match the actions missing
# from possible_actions.
transition_probabilities = [
    [  # from state s0
        [0.7, 0.3, 0.0],  # a0: proba 0.7 to state s0 and 0.3 to state s1
        [1.0, 0.0, 0.0],  # a1
        [0.8, 0.2, 0.0],  # a2
    ],
    [  # from state s1
        [0.0, 1.0, 0.0],  # a0
        None,             # a1
        [0.0, 0.0, 1.0],  # a2
    ],
    [  # from state s2
        None,             # a0
        [0.8, 0.1, 0.1],  # a1
        None,             # a2
    ],
]
# rewards[s][a][s2] is the reward collected when action a, taken in state s,
# leads to state s2. Only three transitions carry a non-zero reward.
rewards = [
    [  # from state s0
        [+10, 0, 0],  # a0: +10 when staying in s0
        [0, 0, 0],    # a1
        [0, 0, 0],    # a2
    ],
    [  # from state s1
        [0, 0, 0],    # a0
        [0, 0, 0],    # a1
        [0, 0, -50],  # a2: -50 when moving to s2
    ],
    [  # from state s2
        [0, 0, 0],    # a0
        [+40, 0, 0],  # a1: +40 when moving to s0
        [0, 0, 0],    # a2
    ],
]
# possible_actions[s] lists the action indices that may be taken in state s.
possible_actions = [
    [0, 1, 2],  # state s0
    [0, 2],     # state s1
    [1],        # state s2
]
def policy_fire(state):
    """Return the action the "fire" policy takes in ``state``.

    The policy is a fixed lookup table indexed by the state number:
    state 0 -> action 0, state 1 -> action 2, state 2 -> action 1.
    The expression below builds the list literal and immediately
    subscripts it with ``state``.
    """
    return [0, 2, 1][state]
def policy_random(state):
    """Return an action drawn by ``rnd.choice`` from ``possible_actions[state]``.

    NOTE(review): ``rnd`` is not imported in the visible part of the file;
    presumably it is ``numpy.random`` (or the stdlib ``random`` module), in
    which case the draw is uniform over the allowed actions -- confirm.
    """
    return rnd.choice(possible_actions[state])
def policy_safe(state):
    """Return the action the "safe" policy takes in ``state``.

    The policy is a fixed lookup table indexed by the state number:
    state 0 -> action 0, state 1 -> action 0, state 2 -> action 1.
    """
    return [0, 0, 1][state]
class MDPEnvironment(object):
    """Simulate the process described by the module-level tables.

    The environment reads ``transition_probabilities`` and ``rewards`` and
    keeps two pieces of state: the current state index (``state``) and the
    sum of all rewards collected since the last reset (``total_rewards``).
    """

    def __init__(self, start_state=0):
        """Remember ``start_state`` and put the environment in it."""
        self.start_state = start_state
        self.reset()

    def reset(self):
        """Zero the accumulated reward and go back to ``start_state``."""
        self.total_rewards = 0
        self.state = self.start_state

    def step(self, action):
        """Take ``action`` in the current state and sample the next state.

        Returns:
            A ``(next_state, reward)`` tuple; the environment's ``state`` and
            ``total_rewards`` are updated accordingly.

        Raises:
            ValueError: if the table holds ``None`` for ``action`` in the
                current state, i.e. no transition distribution is defined.
        """
        probabilities = transition_probabilities[self.state][action]
        if probabilities is None:
            # Assuming rnd is numpy.random, p=None would make choice() fall
            # back to a uniform draw and silently simulate an invalid action.
            raise ValueError(
                "action {} is not available in state {}".format(action, self.state)
            )
        # One candidate next state per probability entry, rather than a
        # hard-coded count of three states.
        next_state = rnd.choice(range(len(probabilities)), p=probabilities)
        reward = rewards[self.state][action][next_state]
        self.state = next_state
        self.total_rewards += reward
        return self.state, reward
def run_episode(policy, n_steps, start_state=0, display=True):
    """Run ``policy`` for ``n_steps`` steps and return the total reward.

    Args:
        policy: callable mapping the current state to the action to take.
        n_steps: number of environment steps to simulate.
        start_state: state the episode starts in.
        display: when true, print a trace of the episode: the state before
            each of the first ten steps, any non-zero reward in parentheses,
            ``...`` once the tenth step is passed, and the final total.

    Returns:
        The sum of the rewards collected during the episode.
    """
    # Honour the caller's start_state; it used to be dropped here, so every
    # episode started in the environment's default state.
    env = MDPEnvironment(start_state)
    if display:
        print("States (+rewards):", end=" ")
    for step in range(n_steps):
        if display:
            if step == 10:
                print("...", end=" ")
            elif step < 10:
                print(env.state, end=" ")
        action = policy(env.state)
        _, reward = env.step(action)
        # Only non-zero rewards of the first ten steps are shown.
        if display and step < 10 and reward:
            print("({})".format(reward), end=" ")
    if display:
        print("Total rewards =", env.total_rewards)
    return env.total_rewards
# Compare the three policies: run 1000 episodes of 100 steps each and print
# summary statistics of the per-episode total rewards.
for policy in (policy_fire, policy_random, policy_safe):
    print(policy.__name__)
    # Only the first five episodes print their trace.
    all_totals = [
        run_episode(policy, n_steps=100, display=(episode < 5))
        for episode in range(1000)
    ]
    # std previously used "{:1f}" (minimum width 1, six decimals) where one
    # decimal place, "{:.1f}", was intended, as for the mean.
    print("Summary: mean={:.1f}, std={:.1f}, min={}, max={}".format(
        np.mean(all_totals),
        np.std(all_totals),
        np.min(all_totals),
        np.max(all_totals),
    ))
    print()
在第16行中,有一个函数直接返回了一个带下标的表达式。我从来没有见过这种写法。据我所知,下标必须作用在一个对应的左值(l-value)上,这样才能通过下标访问数组(或其他容器)中的元素。那么你们能告诉我这里发生了什么吗?
答案 0 :(得分:1)
第15-16行:
def policy_fire(state):
    """Return the element of the list ``[0, 2, 1]`` at index ``state``."""
    return [0, 2, 1][state]
此函数假定 `state` 是 0 到 2 之间的整数,用该值作为下标去索引列表 `[0, 2, 1]`,并返回对应的元素。例如,`policy_fire(1)` 将返回 `2`。
此函数在函数 `run_episode` 中被调用,传入的参数是 `env.state`,其中 `env = MDPEnvironment()`。