Question

我正在使用python中的一个简单的q学习代码。在运行几次迭代之后，程序会建议一个有效的路径，但并不总是最短的 - 这是程序的重点。我不确定我在俯瞰什么。我正在使用一个jupyter笔记本。

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

iterations = 200
goalState = 5
gamma = 0.8
qValues = []

#actions

R = np.array([[-1, 0,-1,-1, 0,-1],
              [ 0,-1,-1,-1,-1,100],
              [-1,-1,-1,-1,-1,100],
              [-1,-1,-1,-1, 0,100],
              [ 0,-1,-1, 0,-1,-1],
              [-1,-1,-1,-1,-1,100]])

#inital Q matrix

Q = np.zeros(R.shape)

for i in range(iterations):

    state = np.random.randint(goalState + 1)

    while state != goalState:

        possibleActions = np.where(R[state] >= 0)[0]

        action = possibleActions[np.random.randint(len(possibleActions+1))]

        nextPossibleActions = np.where(R[action] >= 0)[0]

        for k in nextPossibleActions:
            qValues.append(Q[action][k])

        qMax = max(qValues)

        Q[state][action] += R[state][action] + gamma * qMax

        state = action

Q = Q/Q.max()  #normalising the matrix to percentage values

sns.set()
f, ax = plt.subplots(figsize=(8, 6))
cmap = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(Q, cmap = cmap, annot=True, linewidths=.5, ax=ax)

Python 3中的简单Q学习示例

0 个答案: