检查 cluster worker 是意外停止还是主动退出(suicide)

时间:2016-12-19 14:27:25

标签: node.js multithreading memory memory-leaks cluster-computing

我使用 suicide / exitedAfterDisconnect 来区分进程是否为意外退出:

// A worker's IPC channel closed: report it and start a replacement right away.
cluster.on('disconnect', function handleDisconnect(worker) {
    var notice = worker.id + ' disconnect, restart now';
    console.log(notice);
    cluster.fork();
});

// A worker process ended: decide whether the exit was deliberate or accidental.
cluster.on('exit', function handleExit(worker) {
    // Either flag means the worker was disconnected on purpose before exiting.
    var deliberate = worker.exitedAfterDisconnect || worker.suicide;

    if (!deliberate) {
        // Accidental exit: fork a replacement.
        console.info('process exit by accident');
        cluster.fork();
    } else {
        console.info('process exit by kill');
    }

    console.info('process exit');
});

这是因为我会手动断开 worker 的连接并 fork 新的 worker。因此,我必须区分 worker 是意外停止,还是被程序逻辑主动停止的。

但我发现,即使我用 cluster.worker.disconnect() 手动断开连接,worker.exitedAfterDisconnect / worker.suicide 仍然是 false 或 undefined!

我的 Node 版本为 4.6.2

然后,我通过在断开连接回调中添加一行来修改我的代码:

// A worker's IPC channel closed: report it, flag the worker, and fork a replacement.
cluster.on('disconnect', function handleDisconnect(worker) {
    var notice = worker.id + ' disconnect, restart now';
    console.log(notice);
    // Custom marker read back by the 'exit' handler below.
    worker.isSuicide = true;
    cluster.fork();
});

// A worker process ended: use the custom marker to classify the exit.
cluster.on('exit', function handleExit(worker) {
    var flagged = worker.isSuicide;

    if (!flagged) {
        // Accidental exit: no disconnect was seen for this worker, so fork a replacement.
        console.info('process exit by accident');
        cluster.fork();
    } else {
        console.info('process exit by kill');
    }

    console.info('process exit');
});

我将isSuicide标志添加到worker对象,并在退出回调中检查它。

它似乎有用,但我不知道这是否是一个好的解决方案,也不知道为什么我无法获得正确的 worker.exitedAfterDisconnect / worker.suicide 状态。

我把我的库贴在下面。我想要做的只是在内存超过限制时重启进程:

var cluster = require('cluster');
var usage = require('usage');
var os = require('os');

// Raw overrides from the environment (strings, or undefined when unset).
var CPU_COUNT = process.env.CPU_COUNT;
var CHECK_INTERVAL = process.env.CHECK_INTERVAL;

/**
 * Parse an environment override as a positive base-10 integer.
 *
 * @param {string|undefined} raw - Value read from process.env.
 * @param {number} fallback - Value to use when `raw` is missing, non-numeric, zero or negative.
 * @returns {number} The parsed integer, or `fallback`.
 */
function toPositiveInt(raw, fallback) {
    var parsed = parseInt(raw, 10);
    // NaN > 0 is false, so unset and non-numeric values fall through to the fallback.
    return parsed > 0 ? parsed : fallback;
}

// Number of workers to fork; defaults to one per CPU reported by the OS.
var cpuCount = toPositiveInt(CPU_COUNT, os.cpus().length);
// Delay between memory checks in milliseconds; defaults to 5000.
var checkInterval = toPositiveInt(CHECK_INTERVAL, 5000);

module.exports = {
    run: function(bytes, runFunc, cleanFunc) {

        if (cluster.isMaster) {
            for (var i = 0; i < cpuCount; i++) {
                cluster.fork();
            }
            cluster.on('disconnect', function(worker) {
                console.log('' + worker.id + ' disconnect, restart now');
                worker.isSuicide = true;
                cluster.fork();
            });
            cluster.on('exit', function(worker) {
                //if accidental exit
                //是否意外退出
                if (worker.isSuicide) {
                    console.info('process exit by kill');
                } else {
                    console.info('process exit by accident');
                    cluster.fork();
                }
                console.info('process exit');
            });
        } else {

            runFunc && runFunc();

            var killing = false;
            setInterval(function() {

                usage.lookup(process.pid, function(err, result) {

                    if (result === null || result === undefined) {
                        console.log("memory check fail");
                        return;
                    }
                    if (parseInt(result.memory) > bytes && killing === false) {
                        console.log("memory exceed, start to kill");

                        //stop process
                        //杀死进程
                        var killtimer = setTimeout(function() {
                            console.info("process down!")
                            process.exit(1);
                        }, 5000);

                        //prevent no quit,https://cnodejs.org/topic/570924d294b38dcb3c09a7a0
                        //防止阻塞
                        killtimer.unref();

                        cleanFunc && cleanFunc();

                        try {
                            if (['disconnected', 'dead'].indexOf(cluster.workder.state) < 0) {
                                cluster.worker.disconnect();
                            }
                        } catch (err) {};

                        killing = true;
                    }
                });

            }, checkInterval);
        }

    }
}

调用示例:

var memory = require('./memory');

// Memory ceiling passed to memory.run: 100M.
var MEMORY_LIMIT = 100000000;

// Worker entry point: load the actual server module.
function startServer() {
    require('./server.js');
}

// Cleanup hook handed to memory.run as its third argument.
function cleanUp() {
    console.info('clean now!');
}

memory.run(MEMORY_LIMIT, startServer, cleanUp);

1 个答案:

答案 0 :(得分:0)

我知道这有点晚了,但我在谷歌上搜索求助时,发现这是排名靠前的链接。因为我弄清楚了如何在不设置自定义标志 worker.isSuicide 的情况下做到这一点,所以我想分享一下。

由于每个 worker 实际上是一个独立的进程,在操作系统层面有自己的 pid,所以 worker 死掉时会带有退出代码。当 worker 意外死亡时,它的退出代码为 1;如果它正常退出,退出代码为 0,因此只需检查退出代码即可。

示例代码:

var cluster = require('cluster');

// Master process: fork the workers and register the exit handler (see startWorkers)
if (cluster.isMaster) {
  startWorkers();
// Worker process: each worker id demonstrates a different way of exiting
} else {
  // Worker 1 dies from an uncaught error (`Exception` is not declared in this snippet, so the `new` expression itself throws)
  if (cluster.worker.id == 1) {
    throw new Exception("Oh no");
  // Worker 2 exits explicitly with a non-zero (error) exit code
  } else if (cluster.worker.id == 2) {
    process.exit(1);
  }
  process.exit();
  // Every other worker reaches the call above; the author notes it is identical to:
  //process.exit(0);
}

/**
 * Fork the demo workers and restart any worker that did not exit cleanly.
 *
 * A worker that exits with code 0 is treated as a peaceful exit and is not
 * replaced; any other exit (non-zero code, or termination by a signal)
 * causes a new worker to be forked.
 */
function startWorkers () {
  var workerCount = 2;

  for (var i = 0; i < workerCount; i++) {
      cluster.fork();
  }

  // The cluster 'exit' event passes the exit code and, when the worker was
  // killed by a signal, the signal name (the code is then null).
  cluster.on('exit', function (worker, code, signal) {
    if (code === 0) {
      console.log("Worker %d died peacefully...", worker.id);
    } else if (signal) {
      console.log("Worker %d was killed by signal %s, restarting it", worker.id, signal);
      cluster.fork();
    } else {
      console.log("Worker %d died with exit code %d, restarting it", worker.id, code);
      cluster.fork();
    }
  });
}

将具有以下控制台输出:

/tmp/tmp.js:10
    throw new Exception("Oh no");
              ^

ReferenceError: Exception is not defined
    at Object.<anonymous> (/tmp/tmp.js:10:15)
    at Module._compile (module.js:409:26)
    at Object.Module._extensions..js (module.js:416:10)
    at Module.load (module.js:343:32)
    at Function.Module._load (module.js:300:12)
    at Function.Module.runMain (module.js:441:10)
    at startup (node.js:140:18)
    at node.js:1043:3
Worker 2 died with exit code 1, restarting it
Worker 1 died with exit code 1, restarting it
Worker 3 died peacefully...
Worker 4 died peacefully...