So I am using the following code to implement Q-learning in Unity:
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using UnityEngine;

namespace QLearner
{
    public class QLearnerScript
    {
        List<float[]> QStates;   // visited (discretized) states
        List<float[]> QActions;  // Q-values per action, parallel to QStates
        float[] initialState;    // state the last action was taken from
        int initialActionIndex;  // action taken from initialState
        float[] outcomeState;    // state observed after taking that action
        float outcomeActionValue;
        bool firstIteration;
        int possibleActions;
        float learningRate;      // denoted by alpha
        float discountFactor;    // denoted by gamma
        float simInterval;       // discretization step for state values
        System.Random r = new System.Random();

        public int main(float[] currentState, float reward)
        {
            // Update the Q-value of the previous (state, action) pair first
            QLearning(currentState, reward);

            // Applies a sim interval and rounds
            initialState = new float[2] {
                (float)Math.Round((double)currentState[0] / simInterval) * simInterval,
                (float)Math.Round((double)currentState[1] / simInterval) * simInterval
            };
            firstIteration = false;

            int actionIndex = r.Next(0, possibleActions);

            bool exists = false;
            if (QStates.Count > 0)
            {
                for (int i = 0; i < QStates.Count; i++)
                {
                    float[] state = QStates.ElementAt(i);
                    float[] actions = QActions.ElementAt(i);
                    if (state[0] == initialState[0] && state[1] == initialState[1])
                    {
                        exists = true;
                        // Known state: act greedily with respect to the current Q-values
                        initialActionIndex = Array.IndexOf(actions, MaxFloat(actions));
                        return initialActionIndex;
                    }
                }
            }
            if (!exists)
            {
                // Unknown state: add it with all action values initialized to zero
                float[] actionVals = new float[possibleActions];
                for (int i = 0; i < possibleActions; i++)
                {
                    actionVals[i] = 0f;
                }
                QStates.Add(initialState);
                QActions.Add(actionVals);
            }
            // Fall back to a random action for states seen for the first time
            initialActionIndex = actionIndex;
            return initialActionIndex;
        }

        public QLearnerScript(int possActs)
        {
            QStates = new List<float[]>();
            QActions = new List<float[]>();
            possibleActions = possActs;
            learningRate = .5f; // Between 0 and 1
            discountFactor = 1f;
            simInterval = 1f;
            firstIteration = true;
        }

        public void QLearning(float[] outcomeStateFeed, float reward)
        {
            if (!firstIteration)
            {
                outcomeState = new float[2] {
                    (float)Math.Round((double)outcomeStateFeed[0] / simInterval) * simInterval,
                    (float)Math.Round((double)outcomeStateFeed[1] / simInterval) * simInterval
                };

                // Look up the max Q-value of the outcome state, if it has been seen before
                bool exists = false;
                for (int i = 0; i < QStates.Count; i++)
                {
                    float[] state = QStates.ElementAt(i);
                    float[] actions = QActions.ElementAt(i);
                    if (state[0] == outcomeState[0] && state[1] == outcomeState[1])
                    {
                        exists = true;
                        outcomeActionValue = MaxFloat(actions);
                    }
                }

                // Apply the Q-learning update to the previous (state, action) pair
                for (int i = 0; i < QStates.Count; i++)
                {
                    float[] state = QStates.ElementAt(i);
                    float[] actions = QActions.ElementAt(i);
                    if (state[0] == initialState[0] && state[1] == initialState[1])
                    {
                        if (exists)
                        {
                            actions[initialActionIndex] += learningRate * (reward + discountFactor * outcomeActionValue - actions[initialActionIndex]);
                        }
                        if (!exists)
                        {
                            // Unseen outcome state: its value estimate is zero
                            actions[initialActionIndex] += learningRate * (reward + discountFactor * 0f - actions[initialActionIndex]);
                        }
                    }
                }
            }
        }

        public int getQtableCount()
        {
            return QStates.Count;
        }

        float MaxFloat(float[] numbers)
        {
            float max = numbers[0];
            for (int i = 0; i < numbers.Length; i++)
            {
                if (max < numbers[i])
                {
                    max = numbers[i];
                }
            }
            return max;
        }
    }
}
which works in my environment. However, I am also trying to implement SARSA so that I can test the two algorithms against each other. I know that Q-learning is off-policy and SARSA is on-policy, which means I have to implement a policy to choose the next action rather than simply calling MaxFloat(actions). The actual implementation confuses me, though: how do I modify my script to include such a policy?
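By "a policy" I mean something like an ε-greedy selection over one row of Q-values. The following is only a rough sketch, assuming it lives inside QLearnerScript and reuses its r, possibleActions and MaxFloat; the epsilon parameter is not a field of the script above:

    // Sketch: pick a random action with probability epsilon, otherwise act greedily.
    int EpsilonGreedyAction(float[] actions, float epsilon)
    {
        if (r.NextDouble() < epsilon)
            return r.Next(0, possibleActions);            // explore: random action
        return Array.IndexOf(actions, MaxFloat(actions)); // exploit: greedy action
    }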
Answer (score 0):
With SARSA, the name of the algorithm is also the algorithm: you save the State, Action, Reward, next State and next Action, and then use that information to perform the update.
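In update-rule form, the only difference between the two is which action value of the next state gets bootstrapped:

$$Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \big[\, r_{t+1} + \gamma \, Q(s_{t+1}, a_{t+1}) - Q(s_t, a_t) \,\big] \quad \text{(SARSA)}$$

$$Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \big[\, r_{t+1} + \gamma \max_{a'} Q(s_{t+1}, a') - Q(s_t, a_t) \,\big] \quad \text{(Q-learning)}$$

where $a_{t+1}$ is the action the behaviour policy (for example ε-greedy) actually selects in $s_{t+1}$, and $\alpha$ and $\gamma$ correspond to learningRate and discountFactor in your script.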
You compute the update not at the point where you only have the current state and reward, but once you also have the previous state, the action taken there, the reward received, and the current state (and, for SARSA, the action chosen in it). SARSA uses the action actually taken in the current state, whereas Q-learning replaces it with the greedy policy's prediction, i.e. the maximum Q-value of the current state.
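A minimal sketch of that flow in C#, kept separate from your script rather than being a drop-in replacement; the class name SarsaSketch, the string state key, and the epsilon field are placeholders, and the rounding/discretization from your original script would still have to be applied before building the key:

    using System;
    using System.Collections.Generic;

    public class SarsaSketch
    {
        Dictionary<string, float[]> Q = new Dictionary<string, float[]>(); // state key -> action values
        int possibleActions;
        float learningRate = .5f;   // alpha
        float discountFactor = 1f;  // gamma
        float epsilon = .1f;        // exploration rate of the epsilon-greedy policy
        System.Random r = new System.Random();

        string prevKey = null;      // previous state (as key)
        int prevActionIndex = -1;   // action actually taken in the previous state

        public SarsaSketch(int possActs) { possibleActions = possActs; }

        // Called once per step, like main() in the Q-learning script:
        // pass the (discretized) current state and the reward for the previous action.
        public int Step(string stateKey, float reward)
        {
            if (!Q.ContainsKey(stateKey))
                Q[stateKey] = new float[possibleActions]; // new states start at zero

            // 1. Choose the NEXT action with the behaviour policy (epsilon-greedy).
            int actionIndex = EpsilonGreedy(Q[stateKey]);

            // 2. Update the PREVIOUS (state, action) pair using the value of the
            //    action that was actually chosen -- this is the on-policy part.
            if (prevKey != null)
            {
                float[] prevActions = Q[prevKey];
                prevActions[prevActionIndex] += learningRate *
                    (reward + discountFactor * Q[stateKey][actionIndex] - prevActions[prevActionIndex]);
            }

            // 3. Remember state and action for the next update.
            prevKey = stateKey;
            prevActionIndex = actionIndex;
            return actionIndex;
        }

        int EpsilonGreedy(float[] actions)
        {
            if (r.NextDouble() < epsilon)
                return r.Next(0, possibleActions);       // explore
            int best = 0;
            for (int i = 1; i < actions.Length; i++)     // exploit: argmax
                if (actions[i] > actions[best]) best = i;
            return best;
        }
    }

Each call to Step corresponds to main() in your Q-learning script: the caller passes the current state and the reward received for the previous action, and gets back the action to execute next. The only structural change from Q-learning is that the update waits until the next action has been chosen by the policy and then uses that action's Q-value instead of MaxFloat(actions).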