我有一个非常简单的 Node.js 8.5.0 + Express 4.15.5 服务器,使用 cluster 模块,只提供静态文件。问题是事件循环似乎有时会被阻塞很长时间。我使用 blocked 模块(检查间隔为 70 毫秒),并统计自上次检查以来已处理的请求数。很多时候这个计数器为零:也就是说,即使没有任何请求,事件循环有时也会被阻塞长达一秒。
日志:
Execution blocked for 1056 ms [2017-09-27 16:18:06.322], 1 requests, total requestcount 115, pid 31071
Execution blocked for 358 ms [2017-09-27 16:18:12.570], 0 requests, total requestcount 123, pid 31071
Execution blocked for 1578 ms [2017-09-27 16:18:15.551], 10 requests, total requestcount 147, pid 31071
Execution blocked for 872 ms [2017-09-27 16:18:35.926], 0 requests, total requestcount 557, pid 31077
Execution blocked for 117 ms [2017-09-27 16:20:11.668], 0 requests, total requestcount 761, pid 31077
Execution blocked for 381 ms [2017-09-27 16:23:00.268], 0 requests, total requestcount 2231, pid 31077
Execution blocked for 1206 ms [2017-09-27 16:23:06.096], 2 requests, total requestcount 3147, pid 31070
Execution blocked for 505 ms [2017-09-27 16:23:10.319], 2 requests, total requestcount 2256, pid 31077
Execution blocked for 475 ms [2017-09-27 16:23:10.335], 1 requests, total requestcount 840, pid 31071
Execution blocked for 2113 ms [2017-09-27 16:23:16.918], 1 requests, total requestcount 2283, pid 31077
Execution blocked for 303 ms [2017-09-27 16:23:20.071], 0 requests, total requestcount 3261, pid 31070
Execution blocked for 423 ms [2017-09-27 16:23:23.417], 1 requests, total requestcount 3267, pid 31070
Execution blocked for 6395 ms [2017-09-27 16:23:31.633], 7 requests, total requestcount 3285, pid 31070
Execution blocked for 210 ms [2017-09-27 16:32:04.764], 10 requests, total requestcount 3306, pid 31071
Execution blocked for 690 ms [2017-09-27 16:32:05.945], 1 requests, total requestcount 3313, pid 31071
Execution blocked for 704 ms [2017-09-27 16:32:05.948], 5 requests, total requestcount 5214, pid 31077
Execution blocked for 857 ms [2017-09-27 16:32:07.082], 0 requests, total requestcount 3315, pid 31071
Execution blocked for 1475 ms [2017-09-27 16:32:12.691], 0 requests, total requestcount 3333, pid 31071
Execution blocked for 1487 ms [2017-09-27 16:32:12.692], 1 requests, total requestcount 5247, pid 31077
Execution blocked for 125 ms [2017-09-27 16:32:16.306], 0 requests, total requestcount 7921, pid 31070
Execution blocked for 189 ms [2017-09-27 16:33:16.369], 0 requests, total requestcount 8087, pid 31070
Execution blocked for 182 ms [2017-09-27 16:33:16.621], 0 requests, total requestcount 8087, pid 31070
strace示例:
epoll_wait(6, [], 1024, 70) = 0
epoll_wait(6, [], 1024, 70) = 0
epoll_wait(6, [], 1024, 70) = 0
write(2, "Execution blocked for 724 ms [20"..., 103) = 103
epoll_wait(6, [{EPOLLIN, {u32=24, u64=24}}], 1024, 70) = 1
read(24, "", 1024) = 0
epoll_ctl(6, EPOLL_CTL_DEL, 24, 0x7fff8ef58de0) = 0
close(24) = 0
epoll_wait(6, [], 1024, 0) = 0
epoll_wait(6, [], 1024, 69) = 0
epoll_wait(6, [], 1024, 70) = 0
还有足够的内存和CPU可用(3核):
top - 16:36:50 up 6 days, 5:51, 4 users, load average: 0.17, 0.37, 0.45
Tasks: 137 total, 1 running, 136 sleeping, 0 stopped, 0 zombie
%Cpu(s): 8.3 us, 0.6 sy, 0.0 ni, 91.0 id, 0.1 wa, 0.0 hi, 0.0 si, 0.0 st
KiB Mem : 4562340 total, 170144 free, 2234000 used, 2158196 buff/cache
KiB Swap: 1048572 total, 993992 free, 54580 used. 2075596 avail Mem
我也设置了GC监控,但很少达到100毫秒:
// Watch garbage-collection performance entries and warn about any pause
// that lasted longer than 100 ms.
const obs = new PerformanceObserver((entryList) => {
  const [gcEntry] = entryList.getEntries();
  const isLongPause = gcEntry.duration > 100;
  if (isLongPause) {
    console.warn('gc', gcEntry);
  }
  // Drop the recorded GC entries after each notification.
  performance.clearGC();
});
obs.observe({ entryTypes: ['gc'] });
是 Express 或某些模块导致了阻塞吗(即使看似没有任何事情发生)?该如何调试?如果不是,那是 Node.js 本身的问题吗?如果也不是,那又是什么?既然"不要阻塞事件循环"是 Node.js 的基本原则,我想应该有工具可以调试这类问题,但我没能找到。
编辑:使用spdy和本机https模块进行测试,没有区别。
修改:源代码:
"use strict";
const bodyParser = require('body-parser'),
cluster = require('cluster'),
cors = require('cors'),
compress = require('compression'),
cookieParser = require('cookie-parser'),
express = require('express'),
favicon = require('serve-favicon'),
fs = require('fs'),
http = require('http'),
// https = require('spdy'),
https = require('https'),
path = require('path'),
strftime = require('strftime');
const {
performance,
PerformanceObserver
} = require('perf_hooks');
// Process-wide mutable state shared through the global object; later code
// reads and writes it as the bare identifier `V`.
global.V = {};
// Number of workers to fork, taken from the CLI argument at argv[3] and
// defaulting to 3. The argument is parsed as an integer so that a
// non-numeric or non-positive value falls back to the default instead of
// making the `i < workers` fork loop silently start zero workers.
const requestedWorkers = Number.parseInt(process.argv[3], 10);
const workers = requestedWorkers > 0 ? requestedWorkers : 3;
/**
 * Periodically measures how late a repeating timer fires in order to detect
 * event-loop stalls.
 *
 * Every `interval` milliseconds the elapsed time since the previous tick is
 * read with `process.hrtime`. The excess over `interval` is rounded to whole
 * milliseconds and reported:
 *  - to `cb` when the excess is strictly greater than `interval`;
 *  - otherwise to `cb_ok`, if one was supplied.
 *
 * After each tick the per-interval request counter `V.httpRequests2` is reset
 * to zero. The timer is unref'd, so it does not keep the process alive.
 *
 * @param {number} interval - Timer period in ms; also used as the lateness threshold.
 * @param {function(number): void} cb - Called with the rounded excess (ms) on a stall.
 * @param {function(number): void} [cb_ok] - Called with the rounded excess (ms) otherwise.
 */
function blocked(interval, cb, cb_ok) {
  let lastTick = process.hrtime();
  const timer = setInterval(() => {
    const [seconds, nanos] = process.hrtime(lastTick);
    const elapsedMs = (seconds * 1e9 + nanos) / 1e6;
    const overshoot = elapsedMs - interval;
    const stalled = overshoot > interval;
    if (stalled) {
      cb(Math.round(overshoot));
    } else if (cb_ok) {
      cb_ok(Math.round(overshoot));
    }
    // Restart the measurement window and the per-window request counter.
    lastTick = process.hrtime();
    V.httpRequests2 = 0;
  }, interval);
  timer.unref();
}
// Master process: forks the workers and schedules a replacement whenever one
// exits. Worker process: serves static files over HTTPS with Express and
// logs event-loop stalls and long garbage-collection pauses.
if (cluster.isMaster) {
  console.log(`Master ${process.pid} is running`);
  // Fork workers.
  for (let i = 0; i < workers; i++) {
    cluster.fork();
  }
  cluster.on('exit', (worker, code, signal) => {
    console.log(`worker ${worker.process.pid} died`);
    // Wait two minutes before forking the replacement.
    setTimeout(function() {
      console.log('Fork one replacement worker...');
      cluster.fork();
    }, 120000);
  });
}
else {
  // TLS key/certificate for the HTTPS server; client certificates are not requested.
  V.expressOptions = {
    key: fs.readFileSync('./ssl/server.key'),
    cert: fs.readFileSync('./ssl/ssl-blunde.crt'),
    requestCert: false,
    rejectUnauthorized: false
  };
  V.expressApp = express();
  V.server_ssl = https.createServer(V.expressOptions, V.expressApp);
  V.server_ssl.listen(8080);

  // Request middleware.
  V.expressApp.use(cors({origin: 'https://example.com'}));
  V.expressApp.disable('x-powered-by');
  V.expressApp.use(compress());
  V.expressApp.use(cookieParser());
  V.expressApp.use(favicon(__dirname + '/static/html/favicon.ico'));
  V.expressApp.use(bodyParser.json());
  V.expressApp.use(bodyParser.urlencoded({ extended: true }));

  // Request counters: httpRequests is the running total for this worker,
  // httpRequests2 counts requests since the last blocked() tick (which resets it).
  V.httpRequests = 0;
  V.httpRequests2 = 0;
  V.expressApp.use('*', function(req, res, next) {
    V.httpRequests2++;
    V.httpRequests++;
    next();
  });

  // Static content: static/html is served at the root, then static/ with a
  // one-hour maxAge.
  V.expressApp.use('/', express.static(path.join(__dirname, 'static/html')));
  V.expressApp.use(express.static(path.join(__dirname, 'static'), {
    maxAge: 1000 * 60 * 60
  }));

  // Error handler 1: log the stack trace, then pass the error on.
  V.expressApp.use(function (err, req, res, next) {
    console.error(err.stack);
    next(err);
  });
  // Error handler 2: XHR requests get a JSON error body; all others fall through.
  V.expressApp.use(function (err, req, res, next) {
    if (req.xhr) {
      console.log('Express error', err);
      res.status(500).send({ error: 'Something blew up!' });
    }
    else {
      next(err);
    }
  });
  // Error handler 3 (last): answer with a bare 500.
  V.expressApp.use(function (err, req, res, next) {
    console.log('Express error 500', err);
    if (res.headersSent) {
      // A response is already in flight; delegate to Express's default
      // error handler instead of trying to set the status again.
      next(err);
      return;
    }
    // Setting the status alone never finishes the response and would leave
    // the client hanging, so end it explicitly.
    res.status(500).end();
  });

  console.log(`Worker ${process.pid} started`);

  // Event-loop stall monitor. Severity is chosen by stall length:
  // > 2500 ms -> error, > 500 ms -> warn, otherwise plain log.
  blocked(70, function(ms) {
    const message = `Execution blocked for ${ms} ms [${strftime('%F %T.%L')}], %s requests, total requestcount %s, pid %s`;
    if (ms > 2500) {
      console.error(message, V.httpRequests2, V.httpRequests, process.pid);
    }
    else if (ms > 500) {
      console.warn(message, V.httpRequests2, V.httpRequests, process.pid);
      // After more than 200000 served requests, exit this worker on a
      // mid-sized stall; the master's 'exit' handler forks a replacement.
      if (V.httpRequests > 200000) {
        console.log('Enough requests, exit, requestcount %s, pid %s', V.httpRequests, process.pid);
        process.exit();
      }
    }
    else {
      console.log(message, V.httpRequests2, V.httpRequests, process.pid);
    }
  });

  // GC monitor: warn with the full entry above 500 ms, log the duration above 100 ms.
  const obs = new PerformanceObserver((list) => {
    let gc = list.getEntries()[0];
    if (gc.duration > 500) {
      console.warn('GC', gc);
    }
    else if (gc.duration > 100) {
      console.log('GC', gc.duration);
    }
    performance.clearGC();
  });
  obs.observe({ entryTypes: ['gc'] });
}
编辑:这似乎与 Node.js 和其线程之间的通信方式有关:每次事件循环被阻塞时,都会出现下面这种 futex-EAGAIN 行为。很明显 Node.js 正在等待某些东西,而这实际上阻塞了事件循环。问题不在于任何 I/O,因为没有任何线程处于阻塞状态。
782050 16:14:56.945451111 5 node (17387) < futex res=0
782051 16:14:56.945493832 3 node (17385) > futex addr=7F8F03C8FB20 op=128(FUTEX_PRIVATE_FLAG) val=2
782052 16:14:56.945494164 5 node (17387) > futex addr=7F8F03C8FB20 op=129(FUTEX_PRIVATE_FLAG|FUTEX_WAKE) val=1
782053 16:14:56.945494233 3 node (17385) < futex res=-11(EAGAIN)
782054 16:14:56.945494712 3 node (17385) > futex addr=7F8F03C8FB20 op=129(FUTEX_PRIVATE_FLAG|FUTEX_WAKE) val=1
782055 16:14:56.945494814 5 node (17387) < futex res=0
782056 16:14:56.945494872 3 node (17385) < futex res=0
782057 16:14:56.945495204 3 node (17385) > futex addr=7F8F03C8FB20 op=128(FUTEX_PRIVATE_FLAG) val=2
782058 16:14:56.945495491 5 node (17387) > futex addr=7F8F03C8FB20 op=129(FUTEX_PRIVATE_FLAG|FUTEX_WAKE) val=1
782059 16:14:56.945495541 3 node (17385) < futex res=-11(EAGAIN)
782060 16:14:56.945495941 5 node (17387) < futex res=0
782061 16:14:56.945495992 3 node (17385) > futex addr=7F8F03C8FB20 op=129(FUTEX_PRIVATE_FLAG|FUTEX_WAKE) val=1
782062 16:14:56.945496239 3 node (17385) < futex res=0
782063 16:14:56.945496460 3 node (17385) > futex addr=7F8F03C8FB20 op=128(FUTEX_PRIVATE_FLAG) val=2
782064 16:14:56.945496661 5 node (17387) > futex addr=7F8F03C8FB20 op=129(FUTEX_PRIVATE_FLAG|FUTEX_WAKE) val=1
782065 16:14:56.945496780 3 node (17385) < futex res=-11(EAGAIN)
782066 16:14:56.945497107 5 node (17387) < futex res=0
782067 16:14:56.945497232 3 node (17385) > futex addr=7F8F03C8FB20 op=129(FUTEX_PRIVATE_FLAG|FUTEX_WAKE) val=1
782068 16:14:56.945497381 3 node (17385) < futex res=0
782069 16:14:56.945497596 3 node (17385) > futex addr=7F8F03C8FB20 op=128(FUTEX_PRIVATE_FLAG) val=2
782070 16:14:56.945497764 5 node (17387) > futex addr=7F8F03C8FB20 op=129(FUTEX_PRIVATE_FLAG|FUTEX_WAKE) val=1
782071 16:14:56.945497913 3 node (17385) < futex res=-11(EAGAIN)
782072 16:14:56.945498204 5 node (17387) < futex res=0
答案 0 :(得分:0)
Node.js 和 Express 本身不应该阻塞事件循环。偶尔可能会有一些时间花在垃圾回收上,但我认为它不会像你观察到的阻塞时间那样长。
提供静态文件的内置方法仅使用异步I / O,因此不应阻止事件循环。
如果您想进一步帮助诊断应用中的其他内容可能会导致此问题,那么您可能需要向我们展示您的代码。
回答你的直接问题:
Node.js 或 Express 本身会阻塞事件循环吗?
不,除了非常短的垃圾收集时间。如果你的服务器使用了大量的Javascript对象并且非常非常繁忙,垃圾收集有时可能会落后并花费一些时间来赶上,但这只会在一个非常非常繁忙的服务器中使用大量对象的代码(从而创建了大量的GC工作)。
仅供参考:如果你所做的只是在高负载环境中提供静态文件,那么有比 Express 更高效的方式。一种常见的做法是把 Nginx 放在 Express 服务器前面,由 Nginx 直接从文件系统提供静态文件。此外还有适用于更大规模的 CDN。
如需进一步的帮助,请向我们展示您的实际Express代码,以便我们了解您的服务器正在做什么。
答案 1 :(得分:0)
回答我自己的问题:可能出现所有工作线程都处于忙碌状态的情况(scenario where all worker threads are busy)。不过,我的情况并非如此。
与之前的版本相比,Node.js 8 使用工作线程的方式有所不同。降级到 Node 7.10.1 完全解决了这个问题。由于这个问题在最简单的 Express 服务器中也会出现,我得出的结论是:这是 Node 8 中的一个 bug。