我正在Nodejs中构建一个脚本来从某个网站上抓取7000个网址。
脚本在开始时应该运行,但是在一段时间后它看起来正在失去速度和功率并开始记录错误直到它停止(参见下面的错误)。
我可以使用Node / express的超时设置做什么或者在我的脚本中设置超时吗?
错误:
Unhandled rejection RequestError: Error: read ETIMEDOUT
at new RequestError (/Users/robbertvermeulen/Workspace/Node/scrape/node_modules/request-promise-core/lib/errors.js:14:15)
at Request.plumbing.callback (/Users/robbertvermeulen/Workspace/Node/scrape/node_modules/request-promise-core/lib/plumbing.js:87:29)
at Request.RP$callback [as _callback] (/Users/robbertvermeulen/Workspace/Node/scrape/node_modules/request-promise-core/lib/plumbing.js:46:31)
at self.callback (/Users/robbertvermeulen/Workspace/Node/scrape/node_modules/request/request.js:186:22)
at emitOne (events.js:115:13)
at Request.emit (events.js:210:7)
at Request.onRequestError (/Users/robbertvermeulen/Workspace/Node/scrape/node_modules/request/request.js:878:8)
at emitOne (events.js:115:13)
at ClientRequest.emit (events.js:210:7)
at TLSSocket.socketErrorListener (_http_client.js:401:9)
at emitOne (events.js:115:13)
at TLSSocket.emit (events.js:210:7)
at emitErrorNT (internal/streams/destroy.js:64:8)
at _combinedTickCallback (internal/process/next_tick.js:138:11)
at process._tickCallback (internal/process/next_tick.js:180:9)
我的剧本:
var express = require('express');
var fs = require('fs');
var request = require('request-promise');
var cheerio = require('cheerio');
var async = require('async');
var app = express();
app.get('/scrape', function(req, res) {
// Base url
const baseURL = 'https://www.example.com';
// Final data
var json = [];
var archiveURLs = []; // Holds a list of archive urls
// Loop trough archive urls
async.forEach(archiveURLs, function(url, mainCallback) {
async.waterfall([
// Find post urls on archive urls
function(firstCallback) {
request(url, function(error, response, html) {
if (! error && 200 == response.statusCode) {
var paths = [];
$ = cheerio.load(html);
$('h2 a').each(function(index, element) {
var path = $(element).attr('href');
paths.push(path);
});
firstCallback(null, paths);
}
});
},
// Fetch data from post urls
function(paths, secondCallback) {
var fetchData = [];
async.forEach(paths, function(path, loopCallback) {
var item = {title: '', url: ''};
var url = baseURL + path;
console.log('Fetching', url);
request(url, function(error, response, html) {
if (! error) {
$ = cheerio.load(html);
item.title = $('h1').text().trim();
item.url = url;
fetchData.push(item);
loopCallback();
}
});
}, function(err) {
// Store fetched data in array
for (i = 0; i < fetchData.length; i ++) {
json.push(fetchData[i]);
}
secondCallback(null, 1);
});
}
],
function(err, result) {
mainCallback();
});
}, function(err) {
// Scrape finished
});
// Writes message to browser
res.send('Check your console!')
})
app.listen('8081');
console.log('Magic happens on port 8081');
exports = module.exports = app;
更新
我现在处理我的错误:
request(url, function(error, response, html) {
if (! error) {
if (200 == response.statusCode) {
// Code
loopCallback();
}
} else {
return res.end(error.message);
}
}).on('error', function(e) {
console.log(e);
}).end();