我有一个包含多个页面的网站,每个页面都列出了我想要删除和下载的下载链接。 我几乎没有问题:
content-disposition
但我认为浏览器遵循重定向,重定向的URL在其路径中包含文件名。我目前的代码如下:
var request = require('request');
var cheerio = require('cheerio');
var https = require('follow-redirects').https;
require('events').EventEmitter.prototype._maxListeners = 1000;
for(var i = 1; i <= 10000; i++) {
(function(i){
url = 'http://mywebsite.com/files?page=' + i;
request(url, gotHTML)
})(i);
}
function gotHTML(err, resp, html) {
var $ = cheerio.load(html);
$('.file-header').each(function() {
var data = $(this);
var fileLink = data.children().first().children().first().attr('href');
var fileName = fileLink.substring(10);
var downloadLink = 'https://mywebsite.com/api/download/' + fileName;
download(downloadLink, function() {
console.log('downloaded');
})
})
}
function download(url, cb) {
var request = https.get(url, function(response) {
var location = request.res.headers.location;
console.log(location);
location = location.split('/').pop();
console.log(location);
var file = fs.createWriteStream(location);
response.pipe(file);
file.on('finish', function() {
file.close(cb);
});
});
}
答案 0 :(得分:1)
默认HTTP / HTTPS代理仅对同一来源的请求使用最多5个套接字(maxSockets
)。所以这可能会给你带来一些问题。
尝试更改此内容:
var request = https.get(url, function(response) {
到此:
var options = require('url').parse(url);
options.agent = false; // or use a custom https.Agent with a higher `maxSockets`
var request = https.get(options, function(response) {