Question

我正在研究马尔可夫决策过程（MDP），但在某个地方出错。实际上，我的主要数据集有一个错误。数据可能会出现一些问题。但是，您能否检查一下我的代码并说它是最优的，以及我的代码中的主要问题在哪里？因为我在主数据集上得到了奇怪的值。

我尝试了迭代方法。但是，后退要快得多：

import pandas as pd
import numpy as np
import warnings

data = [['3 0', 'UP', 0.6, '3 1', 5], ['3 0', 'UP', 0.4, '3 2', -10], \
        ['3 0', 'RIGHT', 1, '3 3', 10], ['3 1', 'RIGHT', 1, '3 3', 4], \
        ['3 2', 'DOWN', 0.6, '3 3', 3], ['3 2', 'DOWN', 0.4, '3 1', 5], \
        ['3 3', 'RIGHT', 1, 'EXIT', 7], ['EXIT', 'NO', 1, 'EXIT', 0]]

df = pd.DataFrame(data, columns = ['Start', 'Action', 'Probability', 'End', 'Reward'], \
                  dtype = float) #initial matrix

class MDP:

    def __init__(self, gamma, table):
        self.gamma = gamma
        self.table = table

    def Action(self, state):
        return self.table[self.table.Start == state].Action

    def Reward(self, state, end):
        return self.table[(self.table.Start == state) & (self.table.End == end)].Reward.values

    def T(self, state, action):
        T = []
        for i in range(len(self.table[(self.table.Start == state) & (self.table.Action == action)].Probability.values)):
            T.append((self.table[(self.table.Start == state) & (self.table.Action == action)].Probability.values[i], \
                self.table[(self.table.Start == state) & (self.table.Action == action)].End.values[i]))
        return T

def backward():

    states = list(mdp.table.Start) 
    actions = mdp.Action
    rewards = mdp.Reward
    step = len(states)

    V = {s: 0 for s in states}

    for i in reversed(range(step)):

        V[states[i]] = gamma * max([sum([p * (V[s1] + rewards(states[i], s1)) \
                                       for (p, s1) in mdp.T(states[i], a)]) for a in actions(states[i])])
    return V


gamma = 0.85 #discount factor
mdp = MDP(gamma = gamma, table = df)
answer_backward = backward()
answer_backward

我通过线性方程式进行了计算。并获得其他结果，但线性方程式中应该没有错误

gamma = 0.85 #discount factor

point_3_0, point_3_1, point_3_2, point_3_3, point_exit = 0, 0, 0, 0, 0

point_3_0 = gamma * (max(0.6 * (point_3_1 + 5) + 0.4 * (point_3_2 + (-10)), 1 * point_3_3 + 10))
point_3_1 = gamma * (1 * (point_3_3 + 4))
point_3_2 = gamma * (0.4 * (point_3_1 + 5) + 0.6 * (point_3_3 + 3))
point_3_3 = gamma * (1 * (point_exit + 7))

print(point_3_0, point_3_1, point_3_2, point_3_3)

因此，线性方程式和向后函数的输出不同。但是，应该相同。

优化和调试线性方程式的输出

0 个答案: