我正在抓取一个粉丝网站,以便在我的 Web 应用中显示角色信息,但遇到了 "Can't set headers after they are sent." 错误。
我正在尝试在请求中使用 Promise,但我想我可能从根本上误解了我的代码实际在做什么。
最终目标是循环遍历一组 boss 名称,抓取约 100 个页面的数据,将数据存储在一个数组中,最后将其导出以供以后使用。目前,我已经能够将数据存储在数组中,但即使我的代码执行并成功抓取了数据,仍会出现该错误。
server.js
var express = require('express');
var cheerio = require('cheerio');
var app = express();
var rp = require('request-promise');
var fsp = require('fs-promise');
// GET / handler: requests every entry in `urls`, collects the text of each
// `.page-header__title` element into `bosses`, and replies to the client.
app.get('/', function(req, res){
// NOTE(review): assigned without var/let/const, so `urls` is an implicit global.
urls = [
'fansite/boss1', 'fansite/boss2'
];
// Accumulator shared by every parse() call made during this request.
var bosses = [];
// Loads the fetched HTML with cheerio, appends the text of each matching
// title element to `bosses`, logs the array, and returns that same array.
function parse(html) {
var $ = cheerio.load(html);
// The callback returns nothing; .filter() is used here only to visit each
// matched element for its side effect.
$('.page-header__title').filter(function () {
var data = $(this);
// NOTE(review): `name` is also assigned without a declaration (implicit global).
name = data.text();
bosses.push(name);
})
console.log(bosses);
return bosses;
}
// Starts one request per URL; the chains are independent of each other.
urls.forEach(function (url) {
rp(url)
.then(parse)
// BUG(review): `res.send(...)` is written as a call expression, so it is
// invoked while the chain is being built and its return value (not a
// callback) is what .then() receives. It is also invoked once per URL,
// while a response can be sent only once per request; this is the source
// of "Can't set headers after they are sent."
.then(res.send('Bosses Updated.'))
// Failures are only logged.
.catch(err => console.log('Error:', err));
});
})
// Start listening on port 8081 and announce it on stdout.
app.listen('8081');
console.log('Running on port 8081');

// Expose the Express app through `module.exports`, and point the local
// `exports` alias at the same object.
module.exports = app;
exports = module.exports;
输出:
node server.js start
Running on port 8081
[ 'Obor' ]
[ 'Obor', 'Zulrah' ]
Error: Error: Can't set headers after they are sent.
at ServerResponse.OutgoingMessage.setHeader (_http_outgoing.js:356:11)
at ServerResponse.header (/Users/aaron/Personal Projects/node-scraper/node_modules/express/lib/response.js:767:10)
at ServerResponse.send (/Users/aaron/Personal Projects/node-scraper/node_modules/express/lib/response.js:170:12)
at rp.then.then (/Users/aaron/Personal Projects/node-scraper/server.js:31:21)
at tryCatcher (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/util.js:16:23)
at Promise._settlePromiseFromHandler (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/promise.js:512:31)
at Promise._settlePromise (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/promise.js:569:18)
at Promise._settlePromise0 (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/promise.js:614:10)
at Promise._settlePromises (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/promise.js:693:18)
at Async._drainQueue (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/async.js:133:16)
at Async._drainQueues (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/async.js:143:10)
at Immediate.Async.drainQueues (/Users/aaron/Personal Projects/node-scraper/node_modules/bluebird/js/release/async.js:17:14)
at runCallback (timers.js:672:20)
at tryOnImmediate (timers.js:645:5)
at processImmediate [as _immediateCallback] (timers.js:617:5)
答案 0 :(得分:2)
如果您想在发送响应之前等待所有网址处理完毕:
// Fetch and parse every URL concurrently, then reply exactly once when all
// of them have finished.
Promise.all(urls.map(function (url) {
  return rp(url).then(parse);
}))
  .then(() => res.send('Bosses Updated.'))
  .catch(err => {
    console.log('Error:', err);
    // Promise.all rejects as soon as any single request fails. Still answer
    // the client so the HTTP request is not left hanging, but only if no
    // response has been started yet.
    if (!res.headersSent) {
      res.status(500).send('Failed to update bosses.');
    }
  });
或
// Same approach with arrow functions: wait for every URL to be fetched and
// parsed, then reply exactly once.
Promise.all(urls.map(url => rp(url).then(parse)))
  .then(() => res.send('Bosses Updated.'))
  .catch(err => {
    console.log('Error:', err);
    // A single failed request rejects the whole batch. Send an error
    // response so the client is not left waiting, unless a response has
    // already been started.
    if (!res.headersSent) {
      res.status(500).send('Failed to update bosses.');
    }
  });
答案 1 :(得分:1)
`res.send` 会向客户端发送完整的 HTTP 响应(包括标头和内容),这就是您不能对同一个请求多次调用它的原因。
答案 2 :(得分:1)
您可以使用 `Promise.all`,并对每个请求单独捕获错误,这样就不会丢失已经成功的请求。等 Promise.all 完成(即所有请求都结束)之后,再在响应中报告结果:
/**
 * Marker type wrapping the reason a single request failed, so a failure can
 * travel through Promise.all as an ordinary resolved value.
 * @param {*} reason - whatever describes the failure
 */
const Fail = function (reason) {
  this.reason = reason;
};

/**
 * @param {*} value - any value
 * @returns {boolean} true only when `value` is truthy and its `constructor` is Fail
 */
const isFail = (value) => {
  if (!value) {
    return false;
  }
  return value.constructor === Fail;
};

/**
 * @param {*} value - any value
 * @returns {boolean} the negation of isFail(value)
 */
const isNotFail = (value) => isFail(value) === false;
// Fetch and parse one URL. Any rejection in that chain is converted into a
// Fail value carrying [url, err], so this promise always resolves.
const fetchBoss = (url) => {
  const parsed = rp(url).then(parse);
  return parsed.catch((err) => new Fail([url, err]));
};

// Because each per-URL promise resolves (to a parse result or a Fail), the
// combined promise settles only after every request has finished; the
// collected results are then sent as JSON.
const pending = urls.map((url) => fetchBoss(url));
Promise.all(pending).then((results) => res.json(results));
如果您要向该网站发出大量请求,可能需要对请求进行限流:要么限制同时打开的请求数,要么限制特定时间段内发出的请求数。您可以使用 throttling 来实现这一点;如果您的 Express 应用是公开站点,可能会有许多用户同时触发抓取,那么最好确保目标站点不会把您的抓取视为攻击。
// `throttle` / `throttlePeriod` are not defined in this snippet; they must be
// supplied by a throttling helper library.
const max = throttle(8); // maximum 8 open connections
// Alternative: const max = throttlePeriod(8, 1000); // maximum 8 requests per second

// Fetch and parse every URL through the throttled wrapper. A failure in any
// single chain becomes a Fail value so it cannot reject the whole batch.
Promise.all(
  urls.map(
    url =>
      max(rp)(url) // throttle requests made
        .then(parse)
        .catch(err => new Fail([url, err]))
  )
)
  .then(
    results =>
      // Send the collected results as JSON. The original passed the
      // JSON.parse function itself to res.send instead of the results.
      res.json(results)
  );