Question

在强化学习中，一个典型的例子是多风网格世界（windy gridworld）。

我遇到的是多风网格世界的一个变体：它额外多了一面墙和随机风，而我正是卡在这两个新增的部分上。

图1显示了一个标准的网格世界，包含起始（S）和目标（G）单元格，但有两处不同：其中有一面智能体无法穿过的墙（用黑色单元格表示），并且网格右侧区域有一股吹向左下方的侧风。每个单元格中可用的动作是国王走法（king's moves），即每个单元格共有 8 个动作。如果某个动作会把你带到网格世界之外或撞上墙，你最终会停在最近的单元格中（例如，在左上角的单元格中向东北方向移动，结果会向右移动一个单元格）。在右侧区域，所得到的下一个单元格会被随机的“风”向左下方平移，风的平均强度逐列变化。每一列下方给出了该列风的平均强度，即向左下方平移的单元格数。

由于随机性，风的强度有时会在每列给出的平均值基础上变化 1（除非平均值为 0）。也就是说，有三分之一的时间你恰好按该列下方标明的值向左下方移动，有三分之一的时间你比该值多向左下方移动一个单元格，另有三分之一的时间你比平均值少移动一个单元格。例如，如果你位于墙所在的那一行、开口的正中间，并向上移动，那么有三分之一的概率你最终落在该单元格以西一列的位置，三分之一的概率落在该单元格以西两列、以南一行的位置，另有三分之一的概率落在同一列、该单元格以北一行的位置。风的作用取决于你当前所在的单元格，而不是你将要前往的单元格。

在上述问题中实现 Q 学习算法，取 α = 0.1、γ = 0.9，并对所有的 s、a 取初始值 Q(s, a) = 0。除了直接到达目标单元格的动作（奖励 r_g = 10）之外，每个动作产生的奖励均为 r_s = −1。使用：ε-贪婪（ε-greedy）动作选择方法，ε = 0.2；以及分别取初始 Q(s, a) > 0 和初始 Q(s, a) < 0 的贪婪动作选择方法。

我的matlab代码可以使用。

我真正的问题出在 function nextPos = GiveNextPos(curPos, actionIndex, windpowers, gridCols, gridRows) 这个函数上：智能体先选定某个动作，然后移动到下一个位置。但影响下一个位置的因素有很多，例如随机风和墙。

所以第一个问题是关于随机风的：我该如何用 MATLAB 编程来表达“有 1/3 的概率风力为 3，另有 1/3 的概率风力为 1……”？

第二个问题是关于撞墙的：我是否应该先根据国王走法和风计算出下一个位置，然后再用这个位置来检查是否撞到了墙？

function WindyGridWorldQLearning()
% WindyGridWorldQLearning  Train a tabular Q-learning agent on the windy
% grid world and plot the greedy path for the selected episodes.
%
% Q is a gridrows x gridcols x actionCount array initialised to zero.
% Actions are chosen epsilon-greedily. Every step is rewarded -1, except
% the step that lands on the goal, which is rewarded 10. The update
% bootstraps from the maximum Q-value of the next state (Q-learning), not
% from the action that is actually taken next.

    fprintf('WindyGridWorldQLearning\n');

    gamma = 0.9;    % discount factor
    alpha = 0.1;    % learning rate
    epsilon = 0.2;  % probability of taking a random (exploratory) action

    gridcols = 10;
    gridrows = 7;
    windpowers = [0 0 0 0 1 1 2 2 1 1];  % wind strength, one entry per column
    fontsize = 16;
    showTitle = 1;

    episodeCount = 900;
    selectedEpisodes = [900];  % episodes after which the greedy path is drawn

    start.row = 7;
    start.col = 1;
    goal.row = 1;
    goal.col = 1;

    selectedEpIndex = 1;
    actionCount = 8;  % actions 1..8 of GiveNextPos (the "hold" action 9 is not used)

    % initialize Q with zeros
    Q = zeros(gridrows, gridcols, actionCount);

    % loop through episodes
    for ei = 1:episodeCount,
        curpos = start;

        % epsilon-greedy choice of the first action of the episode
        if(rand > epsilon) % greedy
            [qmax, a] = max(Q(curpos.row,curpos.col,:));
        else
            a = IntRand(1, actionCount);
        end

        while(PosCmp(curpos, goal) ~= 0)
            % take action a, observe r, and nextpos
            nextpos = GiveNextPos(curpos, a, windpowers, gridcols, gridrows);
            if(PosCmp(nextpos, goal) ~= 0), r = -1; else r = 10; end

            % choose a_next from nextpos; qmax is kept for the update below
            [qmax, a_next] = max(Q(nextpos.row,nextpos.col,:));
            if(rand <= epsilon) % explore
                a_next = IntRand(1, actionCount);
            end

            % update Q with the greedy value of the next state, regardless
            % of whether a_next ended up being exploratory
            curQ = Q(curpos.row, curpos.col, a);
            nextQ = qmax;
            Q(curpos.row, curpos.col, a) = curQ + alpha*(r + gamma*nextQ - curQ);

            curpos = nextpos; a = a_next;
        end % states in each episode

        % if the current state of the world is going to be drawn ...
        if(selectedEpIndex <= length(selectedEpisodes) && ei == selectedEpisodes(selectedEpIndex))
            % follow the greedy policy from the start; the step cap keeps
            % the rollout finite if the greedy policy loops
            curpos = start;
            rows = []; cols = []; acts = [];
            for i = 1:(gridrows + gridcols) * 10,
                [qmax, a] = max(Q(curpos.row,curpos.col,:));
                nextpos = GiveNextPos(curpos, a, windpowers, gridcols, gridrows);
                rows = [rows curpos.row];
                cols = [cols curpos.col];
                acts = [acts a];

                if(PosCmp(nextpos, goal) == 0), break; end
                curpos = nextpos;
            end % states in each episode

            figure('Name',sprintf('Episode: %d', ei), 'NumberTitle','off');
            DrawWindyEpisodeState(rows, cols, acts, start.row, start.col, goal.row, goal.col, windpowers, gridrows, gridcols, fontsize);
            if(showTitle == 1),
                % the title names the algorithm that is actually run here
                title(sprintf('Windy grid-world Q-learning - episode %d - (\\epsilon: %3.3f), (\\alpha = %3.4f), (\\gamma = %1.1f)', ei, epsilon, alpha, gamma));
            end

            selectedEpIndex = selectedEpIndex + 1;
        end

    end % episodes loop

function c = PosCmp(pos1, pos2)
% PosCmp  Compare two positions given as structs with fields row and col.
%   Returns the row difference (pos1 - pos2) when the rows differ, and the
%   column difference otherwise. The result is 0 only when both row and
%   col are equal.
rowDiff = pos1.row - pos2.row;
if(rowDiff ~= 0)
    c = rowDiff;
else
    c = pos1.col - pos2.col;
end

function nextPos = GiveNextPos(curPos, actionIndex, windpowers, gridCols, gridRows)
% GiveNextPos  Apply one action and the column wind to a grid position.
%
%   curPos      - struct with fields row and col
%   actionIndex - 1 east, 2 south, 3 west, 4 north, 5 northeast,
%                 6 southeast, 7 southwest, 8 northwest, 9 hold
%   windpowers  - wind strength, indexed by column
%   gridCols    - number of columns; the resulting col is clamped to 1..gridCols
%   gridRows    - number of rows; the resulting row is clamped to 1..gridRows
%
%   Returns nextPos, a struct with the same fields as curPos. Any other
%   actionIndex prints a message and leaves the move at curPos (the wind
%   is still applied).

% Row/column offsets of actions 1..9 (south increases the row index).
rowStep = [ 0  1  0 -1 -1  1  1 -1  0];
colStep = [ 1  0 -1  0  1  1 -1 -1  0];

nextPos = curPos;
if any(actionIndex == 1:9)
    nextPos.row = curPos.row + rowStep(actionIndex);
    nextPos.col = curPos.col + colStep(actionIndex);

    % The wall occupies row 4, columns 1..4. A move that would land on it
    % is partly undone: west keeps the old column, every other move with a
    % vertical component keeps the old row. East and hold are not checked.
    if(nextPos.row == 4 && nextPos.col <= 4)
        if(actionIndex == 3)
            nextPos.col = curPos.col;
        elseif(actionIndex ~= 1 && actionIndex ~= 9)
            nextPos.row = curPos.row;
        end
    end
else
    disp(sprintf('invalid action index: %d', actionIndex))
end

% Wind is taken from the column the agent is in when it acts, not the
% column it moves into. This follows the problem statement and keeps the
% lookup inside windpowers when the move steps past the east edge.
% NOTE(review): the shift is deterministic and subtracts from the row; the
% problem statement describes a +/-1 random variation and a downward push.
% Confirm before changing, since that alters the learning task.
if(curPos.col > 4)
    wind = windpowers(curPos.col);
    nextPos.row = nextPos.row - wind;
    nextPos.col = nextPos.col - wind;
end

% Keep the result on the grid.
nextPos.col = min(max(nextPos.col, 1), gridCols);
nextPos.row = min(max(nextPos.row, 1), gridRows);




function n = IntRand(lowerBound, upperBound)
% IntRand  Uniformly distributed random integer in lowerBound..upperBound,
%   both bounds included.
%
%   The range has (upperBound - lowerBound + 1) values. Scaling rand by
%   only (upperBound - lowerBound) would never produce upperBound.
n = lowerBound + floor((upperBound - lowerBound + 1) * rand);




function DrawWindyEpisodeState(rows, cols, acts, SRow, SCol, GRow, GCol, windpowers, gridrows, gridcols, fontsize)
% DrawWindyEpisodeState  Draw the grid, the S and G markers and the action
%   taken in each visited cell, then write each column's wind strength at
%   the position returned by FindColBaseCenter.

% Grid, start/goal markers and per-cell actions are the same drawing as
% the wind-free variant.
DrawEpisodeState(rows, cols, acts, SRow, SCol, GRow, GCol, gridrows, gridcols, fontsize);

% One wind-strength label per column.
colIdx = 1;
while colIdx <= gridcols
    [labelX, labelY] = FindColBaseCenter(colIdx, gridrows, gridcols);
    text(labelX, labelY, sprintf('%d',windpowers(colIdx)), 'FontSize', fontsize, 'Rotation', 0);
    colIdx = colIdx + 1;
end



function DrawEpisodeState(rows, cols, acts, SRow, SCol, GRow, GCol, gridrows, gridcols, fontsize)
% DrawEpisodeState  Draw the grid, the S and G markers, and one action
%   symbol for every (rows(k), cols(k), acts(k)) triple.

% The grid is drawn first; the text objects are added afterwards.
DrawGrid(gridrows, gridcols);

% Start and goal markers, unrotated.
markers = {'S', SRow, SCol; 'G', GRow, GCol};
for m = 1:size(markers, 1)
    DrawTextOnCell(markers{m, 1}, 0, markers{m, 2}, markers{m, 3}, gridrows, gridcols, fontsize);
end

% One symbol per recorded step.
stepCount = length(rows);
for step = 1:stepCount
    DrawActionOnCell(acts(step), rows(step), cols(step), gridrows, gridcols, fontsize);
end



function DrawGrid(gridrows, gridcols)
% DrawGrid  Plot the grid lines into the current axes.
%   The unit square is divided into (gridcols + 2) by (gridrows + 2) cells.
%   Each set of lines is drawn as one polyline that alternates direction
%   from line to line, so the connecting segments run along the border.

cellW = 1 / (gridcols + 2);
cellH = 1 / (gridrows + 2);

% Vertical lines: two points per line, alternating top->bottom / bottom->top.
vx = zeros(1, 2*(gridcols + 1));
vy = zeros(1, 2*(gridcols + 1));
k = 0;
for xPos = cellW:cellW:1 - cellW,
    k = k + 1;
    vx([2*k - 1, 2*k]) = xPos;
    if(mod(k, 2) ~= 0)
        vy([2*k - 1, 2*k]) = [1 - cellH, cellH];
    else
        vy([2*k - 1, 2*k]) = [cellH, 1 - cellH];
    end
end

% Horizontal lines: same scheme, alternating right->left / left->right.
hx = zeros(1, 2*(gridrows + 1));
hy = zeros(1, 2*(gridrows + 1));
k = 0;
for yPos = cellH:cellH:1 - cellH,
    k = k + 1;
    hy([2*k - 1, 2*k]) = yPos;
    if(mod(k, 2) ~= 0)
        hx([2*k - 1, 2*k]) = [1 - cellW, cellW];
    else
        hx([2*k - 1, 2*k]) = [cellW, 1 - cellW];
    end
end

plot(vx, vy, '-');
hold on
plot(hx, hy, '-');
axis([0 1 0 1]);
axis off
set(gcf, 'color', 'white');



function DrawTextOnCell(theText, rotation, row, col, gridrows, gridcols, fontsize)
% DrawTextOnCell  Place theText at the point FindCellCenter returns for
%   cell (row, col), using the given font size and rotation.
[centerX, centerY] = FindCellCenter(row, col, gridrows, gridcols);
text(centerX, centerY, theText, 'FontSize', fontsize, 'Rotation', rotation);







function DrawActionOnCell(actionIndex, row, col, gridrows, gridcols, fontsize)
% DrawActionOnCell  Draw the arrow for actionIndex in cell (row, col).
%   Actions 1..4 (east, south, west, north) use an unrotated arrow.
%   Actions 5..8 (northeast, southeast, southwest, northwest) reuse the
%   same four arrows rotated by 45 degrees. Any other index prints a
%   message and draws 'o'.

% Arrow text and rotation for actions 1..8.
arrowGlyphs = {'\rightarrow', '\downarrow', '\leftarrow', '\uparrow', ...
               '\rightarrow', '\downarrow', '\leftarrow', '\uparrow'};
arrowAngles = [0 0 0 0 45 45 45 45];

if any(actionIndex == 1:8)
    textToDraw = arrowGlyphs{actionIndex};
    rotation = arrowAngles(actionIndex);
else
    textToDraw = 'o';
    rotation = 0;
    disp(sprintf('invalid action index: %d', actionIndex))
end
DrawTextOnCell(textToDraw, rotation,  row, col, gridrows, gridcols, fontsize);




function [x,y] = FindCellCenter(row, col, gridrows, gridcols)
% FindCellCenter  Axes coordinates used to place text in cell (row, col).
%   The unit square is split into (gridcols + 2) by (gridrows + 2) cells.
%   y decreases as row increases. x is moved left of the true centre by a
%   fifth of a cell width.
cellW = 1 / (gridcols + 2);
cellH = 1 / (gridrows + 2);
y = 1 - ((row + 0.5) * cellH);
x = (col + 0.5) * cellW - cellW/5;



function [x,y] = FindColBaseCenter(col, gridrows, gridcols)
% FindColBaseCenter  Axes coordinates for a label under column col.
%   Uses the same formula as a cell centre, evaluated for the row just
%   past the last grid row (gridrows + 1). x is moved left by a fifth of
%   a cell width.
cellW = 1 / (gridcols + 2);
cellH = 1 / (gridrows + 2);
baseRow = gridrows + 1;
y = 1 - ((baseRow + 0.5) * cellH);
x = (col + 0.5) * cellW - cellW/5;

Answer 1

对于风，只需生成一个 0 到 1 之间的随机数 n。如果你想要 3 种不同的行为，且每种行为的概率都是 1/3，那么只需分别判断 n < 0.33、0.33 < n < 0.66……等情况即可。

我不太明白你关于墙的问题具体指什么，但你应该先确定智能体将采取的动作以及风对它的影响，然后再看这是否会导致撞墙。如果会，就采取相应的处理。

使用我的matlab代码强化学习中的Windy gridworld游戏问题的变体

1 个答案: