未处理的拒绝RequestError:错误:读取ETIMEDOUT

时间:2017-10-18 08:37:42

标签: javascript node.js express asynchronous

我正在Nodejs中构建一个脚本来从某个网站上抓取7000个网址。

脚本在开始时应该运行,但是在一段时间后它看起来正在失去速度和功率并开始记录错误直到它停止(参见下面的错误)。

我可以使用Node / express的超时设置做什么或者在我的脚本中设置超时吗?

错误:

Unhandled rejection RequestError: Error: read ETIMEDOUT
    at new RequestError (/Users/robbertvermeulen/Workspace/Node/scrape/node_modules/request-promise-core/lib/errors.js:14:15)
    at Request.plumbing.callback (/Users/robbertvermeulen/Workspace/Node/scrape/node_modules/request-promise-core/lib/plumbing.js:87:29)
    at Request.RP$callback [as _callback] (/Users/robbertvermeulen/Workspace/Node/scrape/node_modules/request-promise-core/lib/plumbing.js:46:31)
    at self.callback (/Users/robbertvermeulen/Workspace/Node/scrape/node_modules/request/request.js:186:22)
    at emitOne (events.js:115:13)
    at Request.emit (events.js:210:7)
    at Request.onRequestError (/Users/robbertvermeulen/Workspace/Node/scrape/node_modules/request/request.js:878:8)
    at emitOne (events.js:115:13)
    at ClientRequest.emit (events.js:210:7)
    at TLSSocket.socketErrorListener (_http_client.js:401:9)
    at emitOne (events.js:115:13)
    at TLSSocket.emit (events.js:210:7)
    at emitErrorNT (internal/streams/destroy.js:64:8)
    at _combinedTickCallback (internal/process/next_tick.js:138:11)
    at process._tickCallback (internal/process/next_tick.js:180:9)

我的剧本:

var express = require('express');
var fs      = require('fs');
var request = require('request-promise');
var cheerio = require('cheerio');
var async   = require('async');
var app     = express();

app.get('/scrape', function(req, res) {

   // Base url
   const baseURL = 'https://www.example.com';

   // Final data
   var json = [];   

   var archiveURLs = []; // Holds a list of archive urls

   // Loop trough archive urls
   async.forEach(archiveURLs, function(url, mainCallback) {

      async.waterfall([

         // Find post urls on archive urls
         function(firstCallback) {

            request(url, function(error, response, html) {

               if (! error && 200 == response.statusCode) {
                  var paths = [];
                  $ = cheerio.load(html);
                  $('h2 a').each(function(index, element) {
                     var path = $(element).attr('href');
                     paths.push(path);
                  });
                  firstCallback(null, paths);
               }
            });

         },

         // Fetch data from post urls
         function(paths, secondCallback) {

            var fetchData = [];

            async.forEach(paths, function(path, loopCallback) {

               var item = {title: '', url: ''};
               var url = baseURL + path;

               console.log('Fetching', url);

               request(url, function(error, response, html) {
                  if (! error) {
                     $ = cheerio.load(html);
                     item.title = $('h1').text().trim();
                     item.url = url;
                     fetchData.push(item);
                     loopCallback();
                  }
               });

            }, function(err) {

               // Store fetched data in array
               for (i = 0; i < fetchData.length; i ++) {
                  json.push(fetchData[i]);
               }
               secondCallback(null, 1);
            });
         }

      ],
      function(err, result) {
         mainCallback();
      });

   }, function(err) {

      // Scrape finished

   });

   // Writes message to browser
   res.send('Check your console!')

})

app.listen('8081');
console.log('Magic happens on port 8081');
exports = module.exports = app;

更新

我现在处理我的错误:

request(url, function(error, response, html) {
   if (! error) {
      if (200 == response.statusCode) {
         // Code
         loopCallback();
      }
   } else {
      return res.end(error.message);
   }
}).on('error', function(e) {
   console.log(e);
}).end();

0 个答案:

没有答案