我想抓取某个网站登录后才能访问的页面。我先用 Chrome 登录该网站,然后用 Cookie Inspector 插件导出该网站的 Cookie。 https://chrome.google.com/webstore/detail/cookie-inspector/jgbbilmfbammlbbhmmgaagdkbkepnijn?hl=en
然后我使用 node-crawler 模块抓取该网站。
https://www.npmjs.com/package/crawler
但是我不能正确设置cookie。 这是我的代码:
var Crawler = require("crawler");
var fs = require('fs')
// Cookies for the logged-in session, kept in the JSON shape exported by the
// browser's cookie inspector. Only `name` and `value` are sent to the server;
// the remaining fields are browser-side metadata.
const cookies = [
  {
    "domain": "www.example.com",
    "hostOnly": true,
    "httpOnly": true,
    "name": "BIGipServerfk.example.com-80",
    "path": "/",
    "sameSite": "no_restriction",
    "secure": false,
    "session": true,
    "storeId": "0",
    "value": "3358858762.20480.0000",
    "id": 1
  },
  {
    "domain": "www.example.com",
    "hostOnly": true,
    "httpOnly": true,
    "name": "JSESSIONID",
    "path": "/",
    "sameSite": "no_restriction",
    "secure": false,
    "session": true,
    "storeId": "0",
    "value": "49A78003F8C87804475AE5F151FC4BEE.0605",
    "id": 2
  }
];

// An HTTP `Cookie` request header is a single string of `name=value` pairs
// separated by "; ". The exported cookie objects cannot be used as the header
// value directly, so serialize them here.
const cookieHeader = cookies
  .map(({ name, value }) => `${name}=${value}`)
  .join('; ');

const c = new Crawler({
  maxConnections: 10,
  jQuery: false,
  headers: { Cookie: cookieHeader },
  // This will be called for each crawled page
  callback: function (error, res, done) {
    if (error) {
      console.log(error);
    } else {
      // Save the response body; report write failures instead of ignoring them.
      fs.writeFile('./example.html', res.body, (writeError) => {
        if (writeError) {
          console.log(writeError);
        }
      });
    }
    done();
  }
});
// Queue just one URL, with default callback
c.queue('https://www.example.com/');
这是我找到的仅有的相关资料:
https://github.com/bda-research/node-crawler/issues/165
https://github.com/bda-research/node-crawler/issues/187
如何设置 Cookie?显然,该模块依赖 request 模块,我该如何使用 tough-cookie?