我想训练一个模型来玩游戏。我已经从像素示例中了解了Ping以进行强化学习并基于我的代码。
但是,与该示例相反,在我的游戏中,无法从单个帧中预测最佳移动。它更像是一种扑克游戏,您需要考虑以前的举动。这就是为什么我选择带有LSTM
的模型的原因,就像在生成文本的教程中一样。
我想出了以下代码,但是很难将各个部分放在一起:
/* This function should be fine*/
function createModel(lstmLayerSizes) {
if (!Array.isArray(lstmLayerSizes)) {
lstmLayerSizes = [lstmLayerSizes];
}
_model = tf.sequential();
for (let i = 0; i < lstmLayerSizes.length; ++i) {
const lstmLayerSize = lstmLayerSizes[i];
_model.add(tf.layers.lstm({
units: lstmLayerSize,
returnSequences: i < lstmLayerSizes.length - 1,
inputShape: i === 0 ? [_sampleLength, _indicatorCount] : undefined
}));
}
_model.add(
tf.layers.dense({
units: numberOfActions,
activation: 'softmax'
}));
}
function compileModel(learningRate) {
_optimizer = tf.train.rmsprop(learningRate);
_model.compile({
optimizer: _optimizer,
loss: myLossFunction
});
}
/* Should only have 2 parameters */
function myLossFunction(actions, labels, rewards) {
var tsActions = tf.oneHot(actions, 3);
var tsActionsFloat32 = tf.cast(tsActions, 'float32');
var cross_entropies = tf.losses.softmaxCrossEntropy(tsActionsFloat32, labels);
var loss = tf.sum(tf.mul(rewards, cross_entropies));
return loss;
}
async function train(game) {
const numEpochs = 10;
const numOfSteps = 50;
var observations = [];
var predictions = [];
var actions = [];
compileModel(0.01);
for (let i = 0; i < numEpochs; ++i) {
console.log(`epoch ${i}, start`)
var step = game.init();
for (let s = 0; s < numOfSteps; s++) {
var observation = step.context;
var prediction = predict(observation);
var action = sampleFromProbability(prediction);
step = game.doAction(action);
observations.push([observation]);
predictions = predictions.concat(Array.from(prediction.dataSync()));
actions.push(action);
}
var rewards = calculateRewards(step, referenceFrame, numOfSteps);
var tsActions = tf.tensor1d(actions);
var tsPredictions = tf.tensor2d(predictions, [50, 3], 'float32');
var tsRewards = tf.tensor1d(rewards);
var tsObservations = tf.tensor3d(observations, [50, 1, 5])
/* Gives a runtime error saying no variables can be found */
//_optimizer.minimize(() => { return myLossFunction(actions, tsPredictions, rewards); } );
/* invallid loss function, and should take the rewards into account */
//await _model.fit(tsObservations, tsPredictions);
console.log(`epoch ${i}, stop`)
actions = [];
predictions = [];
rewards = [];
}
}
关于如何进行的任何建议?
答案 0 :(得分:0)
我认为您最好的选择是使用演员-评论家网络。我也在尝试类似的实现。 您可以在这里查看: https://sergiuionescu.github.io/esp32-auto-car/sym/sym.html