我有一个爬虫,它通过 HTTPS 从一个网站下载页面。我使用的是 request 库。
前几个请求没问题,然后它们开始超时。我一次只发送一个请求,并等待它完成后再发送下一个。当请求开始超时的时候,我在同一台计算机上的浏览器中打开该页面,它可以正常加载。
我认为这与 Keep-Alive 有关,但我不确定。也许我需要在几次请求后刷新与服务器的连接,但我不知道如何用这个库做到这一点。
这是我的请求库包装器:
var iconv = require('iconv-lite');
var async = require("async");
var cheerio = require("cheerio");
var request = require("request");
var socks_client = require("socks-client");
var _ = require("underscore");
var settings = require("../settings");
// Agent used for plain-HTTP requests when settings.useTor is enabled; it
// routes traffic through the SOCKS proxy listening on 127.0.0.1:9050.
// When settings.useTor is falsy the agent is left undefined.
// NOTE: the module is bound as `socks_client` by the require() above; the
// previous `socksClient` spelling was an undeclared identifier and threw a
// ReferenceError as soon as useTor was turned on.
var socksHttpAgent = settings.useTor ? new socks_client.Agent(
    {
        proxy: {
            ipaddress: "127.0.0.1",
            port: 9050,
            type: 5, // presumably the SOCKS protocol version (SOCKS5) -- confirm against the socks-client docs
        }
    },
    false, // secure flag: false because this agent is for plain HTTP servers (true would mean HTTPS)
    false // rejectUnauthorized option passed to tls.connect(). Only when secure is set to true
) : undefined;
// Agent used for HTTPS requests when settings.useTor is enabled; it routes
// traffic through the SOCKS proxy listening on 127.0.0.1:9050.
// When settings.useTor is falsy the agent is left undefined.
// NOTE: the module is bound as `socks_client` by the require() above; the
// previous `socksClient` spelling was an undeclared identifier and threw a
// ReferenceError as soon as useTor was turned on.
var socksHttpsAgent = settings.useTor ? new socks_client.Agent(
    {
        proxy: {
            ipaddress: "127.0.0.1",
            port: 9050,
            type: 5, // presumably the SOCKS protocol version (SOCKS5) -- confirm against the socks-client docs
        }
    },
    true, // secure flag: true because this agent is for HTTPS servers (false would mean plain HTTP)
    false // rejectUnauthorized option passed to tls.connect(). Only when secure is set to true
) : undefined;
/**
 * Issue a request via request[method], retrying through async.retry, and
 * hand the decoded body (or a cheerio document) to the caller.
 *
 * @param {string} method - Name of the `request` method to call (e.g. "get", "post").
 * @param {Object} options - Options forwarded to `request`, except for two keys
 *     that are consumed here: `encoding` (charset name passed to iconv-lite to
 *     decode the body; UTF-8 is used when absent) and `cheerio` (when truthy
 *     the decoded body is passed through cheerio.load). `options.url` is read
 *     to pick the HTTP or HTTPS agent.
 * @param {Function} callback - Called as callback(err) when async.retry reports
 *     an error, otherwise as callback(null, result).
 */
var req = module.exports.req = function(method, options, callback) {
    // Turn the raw response body into a string using the caller's charset, if any.
    var decodeBody = function(rawBody) {
        if (options.encoding) {
            return iconv.decode(rawBody, options.encoding);
        }
        return rawBody.toString("utf8");
    };

    // One request attempt; async.retry invokes this again when `done` receives an error.
    var attemptRequest = function(done) {
        var defaults = {
            timeout: settings.timeout * 1000,
            // The caller's `encoding` is stripped below and applied in decodeBody instead.
            encoding: null,
            agent: options.url.match(/^https:/) ? socksHttpsAgent : socksHttpAgent
        };
        var callerOptions = _.omit(options, "encoding", "cheerio");
        // Caller-supplied options win over the defaults.
        var requestOptions = _.extend(defaults, callerOptions);

        request[method](requestOptions, function(err, res, body) {
            if (err) {
                console.error("Chyba požadavku: ", err, options.url, "; zkouším to znovu...");
            }
            done(err, body);
        });
    };

    // Final outcome after async.retry has finished.
    var deliverResult = function(err, data) {
        if (err) {
            callback(err);
            return;
        }
        var text = decodeBody(data);
        callback(null, options.cheerio ? cheerio.load(text) : text);
    };

    async.retry(settings.maxRetries, attemptRequest, deliverResult);
};
// Export convenience wrappers: each is `req` with its `method` argument
// pre-bound, so callers pass only (options, callback).
["get", "post"].forEach(function(methodName) {
    module.exports[methodName] = req.bind(null, methodName);
});
我关闭了 Tor(settings.useTor 为假),因此传递给 agent 选项的值是 undefined。