NodeJS Express 网页抓取:响应头(headers)错误问题

时间:2018-01-31 02:46:03

标签: javascript node.js express

我正在抓取一个粉丝网站,以便在我的 Web 应用中显示角色信息,但遇到了 "Can't set headers after they are sent." 错误。我正尝试在请求中使用 Promise,但我想我可能从根本上误解了我的代码实际在做什么。

最终目标是循环遍历一组 boss 名称,抓取 100 个页面的数据,将数据存储在一个数组中,最后将其导出以供以后使用。目前,我能够将数据存储在数组中,但即使我的代码已执行并成功抓取了数据,仍会出现错误。

server.js

var express = require('express');
var cheerio = require('cheerio');
var app = express();
var rp = require('request-promise');
var fsp = require('fs-promise');

app.get('/', function(req, res){ // GET / handler: requests every entry of `urls` and collects the parsed titles in `bosses`

  urls = [ // NOTE(review): no var/let/const is visible for `urls` here, so this assignment likely creates an implicit global
    'fansite/boss1', 'fansite/boss2'
  ];

  var bosses = []; // accumulator shared by every parse() call made while handling this request

  function parse(html) { // loads one page's HTML and appends the text of each `.page-header__title` match to `bosses`

    var $ = cheerio.load(html);

    $('.page-header__title').filter(function () { // NOTE(review): .filter() is used only for its side effect; the callback returns nothing and the result is discarded
      var data = $(this);
      name = data.text(); // NOTE(review): `name` is also assigned without a visible declaration
      bosses.push(name);
    })
    console.log(bosses);
    return bosses; // returns the shared array, i.e. everything collected so far, not just this page's titles
  }

  urls.forEach(function (url) {
    rp(url)
    .then(parse)
    .then(res.send('Bosses Updated.'))  // NOTE(review): res.send(...) is invoked right here while the chain is being built, and .then() receives its return value rather than a callback; since this runs once per URL, res.send is called more than once for a single request
    .catch(err => console.log('Error:', err)); // failures are only logged
  });
})

app.listen('8081') // start the HTTP server; the port is passed as the string '8081'
console.log('Running on port 8081'); // NOTE(review): logged right after listen() is called, not from a listening callback
exports = module.exports = app; // expose the Express app as this module's export

输出:

node server.js start
Running on port 8081
[ 'Obor' ]
[ 'Obor', 'Zulrah' ]
Error: Error: Can't set headers after they are sent.
    at ServerResponse.OutgoingMessage.setHeader (_http_outgoing.js:356:11)
    at ServerResponse.header (/Users/aaron/Personal Projects/node-scraper/node_modules/express/lib/response.js:767:10)
    at ServerResponse.send (/Users/aaron/Personal Projects/node-scraper/node_modules/express/lib/response.js:170:12)
    at rp.then.then (/Users/aaron/Personal Projects/node-scraper/server.js:31:21)
    at tryCatcher (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/util.js:16:23)
    at Promise._settlePromiseFromHandler (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/promise.js:512:31)
    at Promise._settlePromise (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/promise.js:569:18)
    at Promise._settlePromise0 (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/promise.js:614:10)
    at Promise._settlePromises (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/promise.js:693:18)
    at Async._drainQueue (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/async.js:133:16)
    at Async._drainQueues (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/async.js:143:10)
    at Immediate.Async.drainQueues (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/async.js:17:14)
    at runCallback (timers.js:672:20)
    at tryOnImmediate (timers.js:645:5)
    at processImmediate [as _immediateCallback] (timers.js:617:5)

3 个答案:

答案 0 :(得分:2)

如果您想等所有网址都处理完毕之后再发送响应:

// Fetch and parse every URL in parallel, then answer the request exactly once
// after all of them have finished.
Promise.all(urls.map(function (url) {
  return rp(url).then(parse);
}))
.then(() => res.send('Bosses Updated.'))
.catch(err => {
  console.log('Error:', err);
  // Always finish the request: if nothing is sent here, the client keeps
  // waiting whenever a single page fails to load or parse.
  if (!res.headersSent) {
    res.status(500).send('Failed to update bosses.');
  }
});

// Compact arrow-function form: parse all URLs in parallel and send a single
// response once every request has completed.
Promise.all(urls.map(url => rp(url).then(parse)))
.then(() => res.send('Bosses Updated.'))
.catch(err => {
  console.log('Error:', err);
  // Always finish the request: if nothing is sent here, the client keeps
  // waiting whenever a single page fails to load or parse.
  if (!res.headersSent) {
    res.status(500).send('Failed to update bosses.');
  }
});

答案 1 :(得分:1)

res.send 会向客户端发送完整的 HTTP 响应(包括响应头和内容),因此对同一个请求不能多次调用它。

答案 2 :(得分:1)

您可以使用 Promise.all,并单独捕获(catch)每个请求的错误,这样就不会丢失已经成功的请求。等 Promise.all 完成(所有请求都结束)之后,再在响应中返回结果:

/**
 * Marker type for a request that did not succeed.
 * @param {*} reason - whatever describes the failure; stored as `reason`.
 */
const Fail = function (reason) {
  this.reason = reason;
};

/** True only for truthy values whose `constructor` property is `Fail`. */
const isFail = (value) => Boolean(value) && value.constructor === Fail;

/** Logical negation of `isFail`. */
const isNotFail = (value) => isFail(value) === false;
// Run every request in parallel. Each request has its own catch that turns a
// rejection into a Fail value, so one bad URL does not reject the whole batch.
// Once all requests have settled, the collected results are sent as JSON.
Promise.all(
  urls.map(function (url) {
    return rp(url).then(parse).catch(function (err) {
      return new Fail([url, err]);
    });
  })
).then(function (results) {
  return res.json(results);
});

如果您要向该网站发出大量请求,则可能需要对请求进行节流(throttle):可以限制同时打开的请求数,也可以限制特定时间段内发出的请求数。您可以使用 throttling 来实现这一点。如果您的 Express 应用是公开站点,可能会有许多用户同时触发抓取,那么最好确保目标站点不会把您的抓取视为攻击。

// Throttled variant: same per-request error capture, but each request goes
// through a limiter so the target site is not flooded.
// NOTE(review): `throttle` / `throttlePeriod` are not defined in this listing;
// presumably they come from a helper library — confirm before use.
const max = throttle(8);//maximum 8 open connections
//const max = throttlePeriod(8,1000);//maximum 8 requests per second
Promise.all(
  urls.map(
    url=>
      max(rp)(url)//throttle requests made
      .then(parse)
      // Convert a rejection into a Fail value so the other results survive.
      .catch(err => new Fail([url,err]))
  )
)
.then(
  results=>
    // Send the collected results; the previous `res.send(JSON.parse)` passed
    // the JSON.parse function itself as the body instead of the data.
    res.json(results)
);