TensorFlow.js, custom loss function, putting the pieces together

Date: 2018-07-19 20:30:49

Tags: javascript tensorflow lstm reinforcement-learning tensorflow.js

I want to train a model to play a game. I went through the "Pong from Pixels" reinforcement-learning example and based my code on it.

However, unlike that example, in my game the best move cannot be predicted from a single frame. It is more like a poker game, where you need to take previous moves into account. That is why I chose a model with LSTM layers, as in the text-generation tutorial.

I came up with the following code, but I am having a hard time putting the pieces together:

/* This function should be fine */
function createModel(lstmLayerSizes) {
  if (!Array.isArray(lstmLayerSizes)) {
    lstmLayerSizes = [lstmLayerSizes];
  }

  _model = tf.sequential();
  for (let i = 0; i < lstmLayerSizes.length; ++i) {
    const lstmLayerSize = lstmLayerSizes[i];
    _model.add(tf.layers.lstm({
      units: lstmLayerSize,
      returnSequences: i < lstmLayerSizes.length - 1,
      inputShape: i === 0 ? [_sampleLength, _indicatorCount] : undefined
    }));
  }
  _model.add(
    tf.layers.dense({
      units: numberOfActions,
      activation: 'softmax'
    }));
}

function compileModel(learningRate) {
  _optimizer = tf.train.rmsprop(learningRate);
  _model.compile({
    optimizer: _optimizer,
    loss: myLossFunction
  });
}

/* Problem: a loss passed to model.compile() is called with exactly two
   arguments (labels, predictions), so `rewards` can never be passed in here. */
function myLossFunction(actions, labels, rewards) {
  // `actions` holds the sampled action indices; `labels` the model's outputs.
  var tsActions = tf.oneHot(actions, 3); // tf.oneHot expects int32 indices
  var tsActionsFloat32 = tf.cast(tsActions, 'float32');
  // tf.Reduction.NONE keeps one cross-entropy per step so each can be
  // weighted by its reward (the default reduction already returns a scalar).
  var cross_entropies = tf.losses.softmaxCrossEntropy(
    tsActionsFloat32, labels, undefined, undefined, tf.Reduction.NONE);
  var loss = tf.sum(tf.mul(rewards, cross_entropies));
  return loss;
}
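/*
  Sketch of one possible workaround (not from the original post): since
  model.compile() only ever hands (labels, predictions) to the loss, the
  rewards could be captured in a closure instead. `makeLossFunction` is a
  hypothetical helper name.
*/
function makeLossFunction(tsRewards) {
  // Returns a two-argument loss compatible with model.compile().
  return function (tsActionLabels, tsPredictions) {
    // One cross-entropy per step, so each can be weighted by its reward.
    const crossEntropies = tf.losses.softmaxCrossEntropy(
      tsActionLabels, tsPredictions, undefined, undefined, tf.Reduction.NONE);
    return tf.sum(tf.mul(tsRewards, crossEntropies));
  };
}
// The model would then have to be recompiled whenever the rewards change:
// _model.compile({ optimizer: _optimizer, loss: makeLossFunction(tsRewards) });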

async function train(game) {
  const numEpochs = 10;
  const numOfSteps = 50;

  var observations = [];
  var predictions = [];
  var actions = [];

  compileModel(0.01);
  for (let i = 0; i < numEpochs; ++i) {
    console.log(`epoch ${i}, start`)
    var step = game.init();
    for (let s = 0; s < numOfSteps; s++) {
      var observation = step.context;
      var prediction = predict(observation);
      var action = sampleFromProbability(prediction);
      step = game.doAction(action);

      observations.push([observation]);
      predictions = predictions.concat(Array.from(prediction.dataSync()));
      actions.push(action);
    }

    var rewards = calculateRewards(step, referenceFrame, numOfSteps); // defined elsewhere
    var tsActions = tf.tensor1d(actions, 'int32'); // int32 so tf.oneHot accepts it
    var tsPredictions = tf.tensor2d(predictions, [numOfSteps, 3], 'float32');
    var tsRewards = tf.tensor1d(rewards);
    var tsObservations = tf.tensor3d(observations, [numOfSteps, 1, 5]); // 5 presumably equals _indicatorCount

    /* Gives a runtime error saying no variables can be found:
       the predictions were computed outside the closure, so minimize()
       cannot trace the loss back to the model's trainable weights. */
    //_optimizer.minimize(() => { return myLossFunction(tsActions, tsPredictions, tsRewards); });

    /* Invalid loss function signature, and it should take the rewards into account */
    //await _model.fit(tsObservations, tsPredictions);

    console.log(`epoch ${i}, stop`)

    // Reset the buffers for the next epoch; `observations` must be cleared too,
    // or the tensor3d shape above will no longer match.
    observations = [];
    actions = [];
    predictions = [];
    rewards = [];
  }
}
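
Something like this might be the way to call minimize() (an untested sketch; it assumes the forward pass has to run inside the closure so the optimizer can find the model's trainable variables, and it reuses the tensors built in train() above):

_optimizer.minimize(() => {
  // Re-running the forward pass here lets minimize() see the model's weights.
  const tsLabels = tf.cast(tf.oneHot(tsActions, 3), 'float32');
  const tsCurrentPredictions = _model.predict(tsObservations);
  const crossEntropies = tf.losses.softmaxCrossEntropy(
    tsLabels, tsCurrentPredictions, undefined, undefined, tf.Reduction.NONE);
  // Reward-weighted sum, as in myLossFunction above.
  return tf.sum(tf.mul(tsRewards, crossEntropies));
});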

Any suggestions on how to proceed?

1 Answer:

Answer 0 (score: 0):

I think your best bet is to use an actor-critic network. I am experimenting with a similar implementation myself. You can check it out here: https://sergiuionescu.github.io/esp32-auto-car/sym/sym.html
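
For reference, a minimal sketch of what a two-headed actor-critic model could look like in TensorFlow.js (shapes and names such as stateInput are placeholders, not taken from the linked demo):

// Hypothetical actor-critic skeleton: an LSTM body shared by two heads.
const stateInput = tf.input({ shape: [sampleLength, indicatorCount] });
const lstmOut = tf.layers.lstm({ units: 32 }).apply(stateInput);
// Actor head: a probability distribution over the possible actions.
const policy = tf.layers.dense({ units: numberOfActions, activation: 'softmax' }).apply(lstmOut);
// Critic head: a single value estimating the expected return of the state.
const value = tf.layers.dense({ units: 1 }).apply(lstmOut);
const acModel = tf.model({ inputs: stateInput, outputs: [policy, value] });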