//http://winhelp2002.mvps.org/hosts.txt
//For puppeteer I read in this host file:
//now we read the host file
// Build the blocklist lookup table from a hosts-format file.
// Each useful line looks like "0.0.0.0 <hostname> [# comment]"; every such
// hostname becomes a key in `hosts` with the value `true`, so membership can
// be checked with `hosts[name] === true`.
// `hostFile` and `hosts` stay declared with `var` so their module-level
// binding semantics are unchanged for any code that relies on them.
var hostFile = fs.readFileSync('hosts.txt', 'utf8').split('\n');
var hosts = {};
for (const line of hostFile) {
  // trim() drops a trailing '\r' (CRLF files) and leading indentation;
  // splitting on runs of whitespace accepts tabs and multiple spaces too.
  const [address, hostname] = line.trim().split(/\s+/);
  if (address === '0.0.0.0' && hostname && !hostname.startsWith('#')) {
    // Hostnames are case-insensitive and browsers report them in lowercase,
    // so normalize the key to make mixed-case entries match.
    hosts[hostname.toLowerCase()] = true;
  }
}
//When loading a page I then filter out requests for these domains (and optionally images):
// Request filter: abort a request when
//   - ad blocking is enabled (task.input.blockads) and the request's hostname
//     is present in the `hosts` blocklist, or
//   - photos are not wanted (task.input.includephotos is falsy) and the
//     request is for an image resource.
// Every other request is allowed to continue.
// NOTE(review): request.abort()/request.continue() presumably require request
// interception to be enabled on `page` elsewhere — confirm against the caller.
page.on('request', (request) => {
  const { blockads, includephotos } = task.input;

  let isBlockedHost = false;
  if (blockads) {
    let hostname = null;
    try {
      // Use the URL parser rather than splitting on '/': `hostname` excludes
      // any port or credentials, which would otherwise defeat the lookup
      // (e.g. "ads.example.com:8080" never matching "ads.example.com").
      hostname = new URL(request.url()).hostname;
    } catch (err) {
      // Unparseable URL: it cannot be matched against the blocklist, so it
      // is treated as not blocked rather than failing the whole handler.
      hostname = null;
    }
    isBlockedHost = hostname !== null && hosts[hostname] === true;
  }

  // resourceType() is only consulted when the host check did not already
  // decide to block the request.
  const isUnwantedImage =
    !isBlockedHost && !includephotos && request.resourceType() === 'image';

  if (isBlockedHost || isUnwantedImage) {
    request.abort();
  } else {
    request.continue();
  }
});
//This solution hugely improved the speed of our scraper.
但是我不明白这一部分:task.input.blockads 是凭空出现的,我不知道它是什么。有什么想法吗?
答案 0 :(得分:0)
这只是一个用于启用/禁用主机文件检查的参数。如果您希望始终进行检查,只需省略这一部分即可。
inorder_visit()