I am trying to implement Q-learning in a simple game I have written. The game is based around the player having to "jump" to avoid oncoming boxes.
I have designed the system with two actions, jump and do_nothing, and the state is the distance to the next block (divided and floored to ensure that there is not a huge number of states).
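For instance, with the resolution of 0.1 used below, whole ranges of raw pixel distances collapse into a single state (a quick illustration of the floor-and-scale step in getBlockDistance):

Math.floor(57 * 0.1);  // 5  — raw distances 50..59 all map to state 5
Math.floor(112 * 0.1); // 11 — raw distances 110..119 all map to state 11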
My problem seems to be that my implementation of the algorithm is not considering "future reward", and so it ends up jumping at the wrong times.
Here is my implementation of the Q-learning algorithm:
JumpGameAIClass.prototype.getQ = function getQ(state) {
if (!this.Q.hasOwnProperty(state)) {
this.Q[state] = {};
for (var actionIndex = 0; actionIndex < this.actions.length; actionIndex++) {
var action = this.actions[actionIndex];
this.Q[state][action] = 0;
}
}
return this.Q[state];
};
JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
var closest = -1;
for (var blockIndex = 0; blockIndex < this.blocks.length; blockIndex++) {
var block = this.blocks[blockIndex];
var distance = block.x - this.playerX;
if (distance >= 0 && (closest === -1 || distance < closest)) {
closest = distance;
}
}
return Math.max(0, Math.floor(closest * this.resolution));
};
JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
var jumpReward = this.getQ(distance)[this.actions[0]];
var doNothingReward = this.getQ(distance)[this.actions[1]];
if (jumpReward > doNothingReward) {
return this.actions[0];
} else if (doNothingReward > jumpReward) {
return this.actions[1];
} else {
if (!this.canJump()) {
return this.actions[1];
}
return this.actions[Math.floor(Math.random() * this.actions.length)];
}
};
JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
// We can't jump while in mid-air
if (!this.canJump()) {
return this.actions[1];
}
if (Math.random() < this.epsilon) {
return this.actions[Math.floor(Math.random() * this.actions.length)];
} else {
return this.getActionWithHighestQ(this.getBlockDistance());
}
};
JumpGameAIClass.prototype.think = function think() {
var reward = this.liveReward;
if (this.score !== this.lastScore) {
this.lastScore = this.score;
reward = this.scoreReward;
} else if (!this.playerAlive) {
reward = this.deathReward;
}
this.drawDistance();
var distance = this.getBlockDistance(),
maxQ = this.getQ(distance)[this.getActionWithHighestQ(distance)],
previousQ = this.getQ(this.lastDistance)[this.lastAction];
    // Standard Q-learning update: Q(s,a) += alpha * (r + gamma * max_a' Q(s',a') - Q(s,a))
    this.getQ(this.lastDistance)[this.lastAction] = previousQ + this.alpha * (reward + (this.gamma * maxQ) - previousQ);
this.lastAction = this.getActionEpsilonGreedy();
this.lastDistance = distance;
switch (this.lastAction) {
case this.actions[0]:
this.jump();
break;
}
};
Here are some of the properties it uses:
epsilon: 0.05,
alpha: 1,
gamma: 1,
resolution: 0.1,
actions: [ 'jump', 'do_nothing' ],
Q: {},
liveReward: 0,
scoreReward: 100,
deathReward: -1000,
lastAction: 'do_nothing',
lastDistance: 0,
lastScore: 0
I have had to use lastAction / lastDistance to calculate Q, as I cannot use the current data (that would be acting on the action performed in the previous frame).

The think method is called once every frame, after all of the rendering and game things are done (physics, controls, death, etc.).
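Schematically, each call to think() closes out the previous frame's transition before opening a new one (names taken from the code above):

// 1. reward       <- outcome of (lastDistance, lastAction) from the previous frame
// 2. distance     <- the current state s'
// 3. Q[lastDistance][lastAction] += alpha * (reward + gamma * max_a Q[distance][a]
//                                            - Q[lastDistance][lastAction])
// 4. lastAction   <- epsilon-greedy action for distance (executed this frame)
// 5. lastDistance <- distance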
var JumpGameAIClass = function JumpGame(canvas) {
Game.JumpGame.call(this, canvas);
Object.defineProperties(this, {
epsilon: {
value: 0.05
},
alpha: {
value: 1
},
gamma: {
value: 1
},
resolution: {
value: 0.1
},
actions: {
value: [ 'jump', 'do_nothing' ]
},
Q: {
value: { },
writable: true
},
liveReward: {
value: 0
},
scoreReward: {
value: 100
},
deathReward: {
value: -1000
},
lastAction: {
value: 'do_nothing',
writable: true
},
lastDistance: {
value: 0,
writable: true
},
lastScore: {
value: 0,
writable: true
}
});
};
JumpGameAIClass.prototype = Object.create(Game.JumpGame.prototype);
JumpGameAIClass.prototype.getQ = function getQ(state) {
if (!this.Q.hasOwnProperty(state)) {
this.Q[state] = {};
for (var actionIndex = 0; actionIndex < this.actions.length; actionIndex++) {
var action = this.actions[actionIndex];
this.Q[state][action] = 0;
}
}
return this.Q[state];
};
JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
var closest = -1;
for (var blockIndex = 0; blockIndex < this.blocks.length; blockIndex++) {
var block = this.blocks[blockIndex];
var distance = block.x - this.playerX;
if (distance >= 0 && (closest === -1 || distance < closest)) {
closest = distance;
}
}
return Math.max(0, Math.floor(closest * this.resolution));
};
JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
var jumpReward = this.getQ(distance)[this.actions[0]];
var doNothingReward = this.getQ(distance)[this.actions[1]];
if (jumpReward > doNothingReward) {
return this.actions[0];
} else if (doNothingReward > jumpReward) {
return this.actions[1];
} else {
if (!this.canJump()) {
return this.actions[1];
}
return this.actions[Math.floor(Math.random() * this.actions.length)];
}
};
JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
if (!this.canJump()) {
return this.actions[1];
}
if (Math.random() < this.epsilon) {
return this.actions[Math.floor(Math.random() * this.actions.length)];
} else {
return this.getActionWithHighestQ(this.getBlockDistance());
}
};
JumpGameAIClass.prototype.onDeath = function onDeath() {
this.restart();
};
JumpGameAIClass.prototype.think = function think() {
var reward = this.liveReward;
if (this.score !== this.lastScore) {
this.lastScore = this.score;
reward = this.scoreReward;
} else if (!this.playerAlive) {
reward = this.deathReward;
}
this.drawDistance();
var distance = this.getBlockDistance(),
maxQ = this.getQ(distance)[this.getActionWithHighestQ(distance)],
previousQ = this.getQ(this.lastDistance)[this.lastAction];
this.getQ(this.lastDistance)[this.lastAction] = previousQ + this.alpha * (reward + (this.gamma * maxQ) - previousQ);
this.lastAction = this.getActionEpsilonGreedy();
this.lastDistance = distance;
switch (this.lastAction) {
case this.actions[0]:
this.jump();
break;
}
};
JumpGameAIClass.prototype.drawDistance = function drawDistance() {
this.context.save();
this.context.textAlign = 'center';
this.context.textBaseline = 'bottom';
this.context.fillText('Distance: ' + this.getBlockDistance(), this.canvasWidth / 2, this.canvasHeight / 4);
this.context.textBaseline = 'top';
this.context.fillText('Last Distance: ' + this.lastDistance, this.canvasWidth / 2, this.canvasHeight / 4);
this.context.restore();
};
JumpGameAIClass.prototype.onFrame = function onFrame() {
Game.JumpGame.prototype.onFrame.apply(this, arguments);
this.think();
}
Game.JumpGameAI = JumpGameAIClass;
body {
background-color: #EEEEEE;
text-align: center;
}
canvas#game {
background-color: #FFFFFF;
border: 1px solid #DDDDDD;
}
<!DOCTYPE HTML>
<html lang="en">
<head>
<title>jump</title>
</head>
<body>
<canvas id="game" width="512" height="512">
<h1>Your browser doesn't support canvas!</h1>
</canvas>
<script src="https://raw.githubusercontent.com/cagosta/requestAnimationFrame/master/app/requestAnimationFrame.js"></script>
<!-- https://gist.github.com/jackwilsdon/d06bffa6b32c53321478 -->
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/4e467f82590e76543bf55ff788504e26afc3d694/game.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2b7ce2c3dd268c4aef9ad27316edb0b235ad0d06/canvasgame.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2696c72e001e48359a6ce880f1c475613fe359f5/jump.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/249c92f3385757b6edf2ceb49e26f14b89ffdcfe/bootstrap.js"></script>
</body>
Answer 0 (score: 5)
You basically have a simplified version of the standard Q-learning algorithm.
I used the values:
epsilon: {
value: 0.01
},
alpha: {
value: 0.7
},
gamma: {
value: 0.9
},
resolution: {
value: 0.1
},
liveReward: {
value: 10
},
scoreReward: {
value: -100
},
deathReward: {
value: 1000
},
With these it had no trouble getting scores of over 100 within the first 20 tries.
Q-learning can be described with temporal logic:

Q(s, a) = r(s, a) + gamma * max_a'(Q(s', a'))

where

r(s, a) = r = the immediate reward
gamma = the relative value of delayed vs. immediate rewards (0 to 1)
s' = the new state after action a
a = the action taken in state s
a' = an action in state s'

You should execute it as: choose an action and execute it.
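As a minimal sketch of that loop in JavaScript (the env object with getState() and step() is a hypothetical stand-in for the game, not part of the code above):

// Tabular Q-learning over a hypothetical episodic environment.
// env.getState() returns a discrete state key; env.step(action)
// returns { reward, done } — both are assumptions for illustration.
var actions = ['jump', 'do_nothing'];
var alpha = 0.7, gamma = 0.9, epsilon = 0.01;
var Q = {};

function getQ(state) {
    if (!Q.hasOwnProperty(state)) {
        Q[state] = { jump: 0, do_nothing: 0 };
    }
    return Q[state];
}

function bestAction(state) {
    var q = getQ(state);
    return q.jump > q.do_nothing ? 'jump' : 'do_nothing';
}

function trainEpisode(env) {
    var state = env.getState();
    var done = false;
    while (!done) {
        // Choose an action (epsilon-greedy) and execute it.
        var action = Math.random() < epsilon
            ? actions[Math.floor(Math.random() * actions.length)]
            : bestAction(state);
        var result = env.step(action);
        var nextState = env.getState();
        // Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a));
        // a terminal state contributes no future value.
        var maxQ = result.done ? 0 : getQ(nextState)[bestAction(nextState)];
        var q = getQ(state);
        q[action] += alpha * (result.reward + gamma * maxQ - q[action]);
        state = nextState;
        done = result.done;
    }
}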
Answer 1 (score: 2)
Your implementation of the algorithm is fine; it just needs some of the parameters tweaking.

If you assign some reward for living, 10 in my example, and set epsilon to 0, you will get a winning AI.

Example:
var JumpGameAIClass = function JumpGame(canvas) {
Game.JumpGame.call(this, canvas);
Object.defineProperties(this, {
epsilon: {
value: 0
},
alpha: {
value: 1
},
gamma: {
value: 1
},
resolution: {
value: 0.1
},
actions: {
value: [ 'jump', 'do_nothing' ]
},
Q: {
value: { },
writable: true
},
liveReward: {
value: 0
},
scoreReward: {
value: 100
},
deathReward: {
value: -1000
},
lastAction: {
value: 'do_nothing',
writable: true
},
lastDistance: {
value: 0,
writable: true
},
lastScore: {
value: 0,
writable: true
}
});
};
JumpGameAIClass.prototype = Object.create(Game.JumpGame.prototype);
JumpGameAIClass.prototype.getQ = function getQ(state) {
if (!this.Q.hasOwnProperty(state)) {
this.Q[state] = {};
for (var actionIndex = 0; actionIndex < this.actions.length; actionIndex++) {
var action = this.actions[actionIndex];
this.Q[state][action] = 0;
}
}
return this.Q[state];
};
JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
var closest = -1;
for (var blockIndex = 0; blockIndex < this.blocks.length; blockIndex++) {
var block = this.blocks[blockIndex];
var distance = block.x - this.playerX;
if (distance >= 0 && (closest === -1 || distance < closest)) {
closest = distance;
}
}
return Math.max(0, Math.floor(closest * this.resolution));
};
JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
var jumpReward = this.getQ(distance)[this.actions[0]];
var doNothingReward = this.getQ(distance)[this.actions[1]];
if (!this.canJump()) {
return this.actions[1];
} else if (jumpReward > doNothingReward) {
return this.actions[0];
} else if (doNothingReward > jumpReward) {
return this.actions[1];
} else {
return this.actions[Math.floor(Math.random() * this.actions.length)];
}
};
JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
if (!this.canJump()) {
return this.actions[1];
}
if (Math.random() < this.epsilon) {
return this.actions[Math.floor(Math.random() * this.actions.length)];
} else {
return this.getActionWithHighestQ(this.getBlockDistance());
}
};
JumpGameAIClass.prototype.onDeath = function onDeath() {
this.restart();
};
JumpGameAIClass.prototype.think = function think() {
var reward = this.liveReward;
if (this.score !== this.lastScore) {
this.lastScore = this.score;
reward = this.scoreReward;
} else if (!this.playerAlive) {
reward = this.deathReward;
}
this.drawDistance();
var distance = this.getBlockDistance(),
maxQ = this.playerAlive ? this.getQ(distance)[this.getActionWithHighestQ(distance)] : 0,
previousQ = this.getQ(this.lastDistance)[this.lastAction];
this.getQ(this.lastDistance)[this.lastAction] = previousQ + this.alpha * (reward + (this.gamma * maxQ) - previousQ);
this.lastAction = this.getActionEpsilonGreedy();
this.lastDistance = distance;
switch (this.lastAction) {
case this.actions[0]:
this.jump();
break;
}
};
JumpGameAIClass.prototype.drawDistance = function drawDistance() {
this.context.save();
this.context.textAlign = 'center';
this.context.textBaseline = 'bottom';
this.context.fillText('Distance: ' + this.getBlockDistance(), this.canvasWidth / 2, this.canvasHeight / 4);
this.context.textBaseline = 'top';
this.context.fillText('Last Distance: ' + this.lastDistance, this.canvasWidth / 2, this.canvasHeight / 4);
this.context.restore();
};
JumpGameAIClass.prototype.onFrame = function onFrame() {
Game.JumpGame.prototype.onFrame.apply(this, arguments);
this.think();
}
Game.JumpGameAI = JumpGameAIClass;
body {
background-color: #EEEEEE;
text-align: center;
}
canvas#game {
background-color: #FFFFFF;
border: 1px solid #DDDDDD;
}
<!DOCTYPE HTML>
<html lang="en">
<head>
<title>jump</title>
</head>
<body>
<canvas id="game" width="512" height="512">
<h1>Your browser doesn't support canvas!</h1>
</canvas>
<script src="https://raw.githubusercontent.com/cagosta/requestAnimationFrame/master/app/requestAnimationFrame.js"></script>
<!-- https://gist.github.com/jackwilsdon/d06bffa6b32c53321478 -->
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/4e467f82590e76543bf55ff788504e26afc3d694/game.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2b7ce2c3dd268c4aef9ad27316edb0b235ad0d06/canvasgame.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2696c72e001e48359a6ce880f1c475613fe359f5/jump.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/249c92f3385757b6edf2ceb49e26f14b89ffdcfe/bootstrap.js"></script>
</body>
Update

Having given this a little more thought, although my example appears to work, it is not correct.

What's nasty is that because the outcome of a jump is not known until many iterations into the future, assigning an immediate reward for living causes whatever random decision was first made in each state to be repeated until the outcome of that decision eventually propagates back through the states.

Given the game's physics, the player's jump length is shorter than the block spacing, which means a jump that clears one block will land farther from the next block than the point where it took off, so the same jump can be made again. As a result, if a "good" jump happens before the first block, the system immediately converges on a successful pattern. If the physics were different, or a "bad" jump were made, this AI might never correct itself.
The problem is that the state really has two components, blockDistance and playerY. Without including playerY in the decision state, the outcome of a jump cannot propagate correctly back to its origin.
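A minimal sketch of such a composite state, assuming the game exposes the player's height as a playerY property (a hypothetical name; the base game may call it something else):

JumpGameAIClass.prototype.getState = function getState() {
    // Combine both state components into one Q-table key so that
    // mid-air frames are distinguished from grounded frames.
    var height = Math.floor(this.playerY * this.resolution); // playerY is an assumption
    return this.getBlockDistance() + ':' + height;
};

Every getQ(distance) call in think() would then become getQ(this.getState()).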
You can get around this in this simple game by biasing decisions toward doing nothing. Since the distance-based state is complete as long as you are not jumping, the outcome of not jumping, i.e. dying, propagates correctly back to the decision not to jump at each distance. It is still a little funky, since once you do jump the propagation of reward will be incorrect, but you can now watch it learn all the same.

Example:
var JumpGameAIClass = function JumpGame(canvas) {
Game.JumpGame.call(this, canvas);
Object.defineProperties(this, {
epsilon: {
value: 0
},
alpha: {
value: 1
},
gamma: {
value: 1
},
resolution: {
value: 0.1
},
actions: {
value: [ 'jump', 'do_nothing' ]
},
Q: {
value: { },
writable: true
},
liveReward: {
value: 10
},
scoreReward: {
value: 100
},
deathReward: {
value: -1000
},
lastAction: {
value: 'do_nothing',
writable: true
},
lastDistance: {
value: 0,
writable: true
},
lastScore: {
value: 0,
writable: true
}
});
};
JumpGameAIClass.prototype = Object.create(Game.JumpGame.prototype);
JumpGameAIClass.prototype.getQ = function getQ(state) {
if (!this.Q.hasOwnProperty(state)) {
this.Q[state] = {};
for (var actionIndex = 0; actionIndex < this.actions.length; actionIndex++) {
var action = this.actions[actionIndex];
this.Q[state][action] = 0;
}
}
return this.Q[state];
};
JumpGameAIClass.prototype.getBlockDistance = function getBlockDistance() {
var closest = -1;
for (var blockIndex = 0; blockIndex < this.blocks.length; blockIndex++) {
var block = this.blocks[blockIndex];
var distance = block.x - this.playerX;
if (distance >= 0 && (closest === -1 || distance < closest)) {
closest = distance;
}
}
return Math.max(0, Math.floor(closest * this.resolution));
};
JumpGameAIClass.prototype.getActionWithHighestQ = function getActionWithHighestQ(distance) {
var jumpReward = this.getQ(distance)[this.actions[0]];
var doNothingReward = this.getQ(distance)[this.actions[1]];
if (!this.canJump() || doNothingReward >= jumpReward) {
return this.actions[1];
} else {
return this.actions[0];
}
};
JumpGameAIClass.prototype.getActionEpsilonGreedy = function getActionEpsilonGreedy() {
if (!this.canJump()) {
return this.actions[1];
}
if (Math.random() < this.epsilon) {
return this.actions[Math.floor(Math.random() * this.actions.length)];
} else {
return this.getActionWithHighestQ(this.getBlockDistance());
}
};
JumpGameAIClass.prototype.onDeath = function onDeath() {
this.restart();
};
JumpGameAIClass.prototype.think = function think() {
var reward = this.liveReward;
if (this.score !== this.lastScore) {
this.lastScore = this.score;
reward = this.scoreReward;
} else if (!this.playerAlive) {
reward = this.deathReward;
}
this.drawDistance();
var distance = this.getBlockDistance(),
maxQ = this.playerAlive ? this.getQ(distance)[this.getActionWithHighestQ(distance)] : 0,
previousQ = this.getQ(this.lastDistance)[this.lastAction];
this.getQ(this.lastDistance)[this.lastAction] = previousQ + this.alpha * (reward + (this.gamma * maxQ) - previousQ);
this.lastAction = this.getActionEpsilonGreedy();
this.lastDistance = distance;
switch (this.lastAction) {
case this.actions[0]:
this.jump();
break;
}
};
JumpGameAIClass.prototype.drawDistance = function drawDistance() {
this.context.save();
this.context.textAlign = 'center';
this.context.textBaseline = 'bottom';
this.context.fillText('Distance: ' + this.getBlockDistance(), this.canvasWidth / 2, this.canvasHeight / 4);
this.context.textBaseline = 'top';
this.context.fillText('Last Distance: ' + this.lastDistance, this.canvasWidth / 2, this.canvasHeight / 4);
this.context.restore();
};
JumpGameAIClass.prototype.onFrame = function onFrame() {
Game.JumpGame.prototype.onFrame.apply(this, arguments);
this.think();
}
Game.JumpGameAI = JumpGameAIClass;
body {
background-color: #EEEEEE;
text-align: center;
}
canvas#game {
background-color: #FFFFFF;
border: 1px solid #DDDDDD;
}
<!DOCTYPE HTML>
<html lang="en">
<head>
<title>jump</title>
</head>
<body>
<canvas id="game" width="512" height="512">
<h1>Your browser doesn't support canvas!</h1>
</canvas>
<script src="https://raw.githubusercontent.com/cagosta/requestAnimationFrame/master/app/requestAnimationFrame.js"></script>
<!-- https://gist.github.com/jackwilsdon/d06bffa6b32c53321478 -->
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/4e467f82590e76543bf55ff788504e26afc3d694/game.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2b7ce2c3dd268c4aef9ad27316edb0b235ad0d06/canvasgame.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/2696c72e001e48359a6ce880f1c475613fe359f5/jump.js"></script>
<script src="https://cdn.rawgit.com/jackwilsdon/d06bffa6b32c53321478/raw/249c92f3385757b6edf2ceb49e26f14b89ffdcfe/bootstrap.js"></script>
</body>