我正在尝试抓取一个使用无限滚动分页的网站。到目前为止,我已经能够成功抓取“第一页”,但无法抓取其余页面。“paginationNumber”(即页码)的最大值会变动,通常在 5 到 10 之间。我想以某种方式遍历 paginationNumber,抓取所有页面的内容,并将它们全部合并到 siteBody 变量中,而不是只保存第一页的内容。有没有可以做到这一点的 npm 库?我一直没能想明白该怎么实现。下面是我目前用来抓取第一页的代码:
//THE HTTP REQUEST
//define the callback functions for the http request
/**
 * Callback for the page request issued by callback1: stores the response
 * body in siteBody and then runs newFileActions().
 *
 * @param {?Error} err - Error reported by the request library, if any.
 * @param {Object} httpResponse2 - Response object (unused here).
 * @param {*} body - Response body to store in siteBody.
 * @throws Rethrows `err` unchanged when the request failed.
 */
var callback2 = function(err, httpResponse2, body){
  // Check for failure first so a failed request never overwrites siteBody
  // with an undefined body.
  if (err){throw err;}
  // NOTE(review): siteBody is not declared in this snippet; presumably it is
  // declared elsewhere in the file -- confirm.
  siteBody = body;
  newFileActions();
};
// Page index appended to urlString2 when callback1 builds its request URL.
var paginationNumber = 0;

/**
 * Callback for the POST issued by callback0: reuses the first cookie from
 * that response and requests one page (urlString2 + page index), handing the
 * result to callback2.
 *
 * @param {?Error} err - Error reported by the request library, if any.
 * @param {Object} httpResponse - Response whose first "set-cookie" header is reused.
 * @param {*} body - Response body (unused here).
 * @param {number} [pageOverride] - Optional page index; when omitted the
 *   module-level paginationNumber is used.
 * @throws Logs and rethrows `err` unchanged when the request failed.
 */
var callback1 = function(err, httpResponse, body, pageOverride){
  if (err){console.log(err); throw err;}

  // The request library calls back with (err, response, body) only. The old
  // fourth parameter was named paginationNumber, which shadowed the
  // module-level counter with undefined and produced "...page=undefined".
  var pageToFetch = (pageOverride === undefined) ? paginationNumber : pageOverride;

  // Keep only the "name=value" part of the first Set-Cookie header. split()
  // also handles a cookie with no attributes (no ";"), where the previous
  // substr(0, indexOf(";")) returned an empty string.
  var sessionCookie = httpResponse.headers["set-cookie"][0].split(";")[0];

  var options2 = {
    //NOTE: urlString2 ends in "page=" so that the page index comes next
    url: urlString2 + pageToFetch,
    gzip: true,
    headers: {
      'Host': hostName,
      'User-Agent': myUserAgent,
      'Accept': myAccept,
      'Accept-Language': myAcceptLanguage,
      'Accept-Encoding': myAcceptEncoding,
      'Referer': "https://" + hostName,
      'Cookie': sessionCookie,
      'Connection': "keep-alive",
      'Upgrade-Insecure-Requests': "1"
    }
  };
  request(options2, callback2);
};
/**
 * Callback for the initial request (options0): reuses the first cookie from
 * that response and POSTs the email/password form to urlString1, handing the
 * result to callback1.
 *
 * @param {?Error} err - Error reported by the request library, if any.
 * @param {Object} httpResponse0 - Response whose first "set-cookie" header is reused.
 * @param {*} body - Response body (unused here).
 * @throws Rethrows `err` unchanged when the request failed.
 */
var callback0 = function(err, httpResponse0, body){
  if (err){throw err;}

  // Keep only the "name=value" part of the first Set-Cookie header. split()
  // also handles a cookie with no attributes (no ";"), where the previous
  // substr(0, indexOf(";")) returned an empty string.
  var sessionCookie = httpResponse0.headers["set-cookie"][0].split(";")[0];

  var options1 = {
    url: urlString1,
    headers: {
      'Host': hostName,
      'User-Agent': myUserAgent,
      'Accept': myAccept,
      'Accept-Language': myAcceptLanguage,
      'Accept-Encoding': myAcceptEncoding,
      'Referer': "https://" + hostName + '/login',
      'Cookie': sessionCookie,
      'Content-Type': "application/x-www-form-urlencoded",
      // NOTE(review): the length is computed from postData while the body is
      // built from `form` below -- confirm the two always hold the same fields.
      'Content-Length': Buffer.byteLength(querystring.stringify(postData)),
      'Connection': "keep-alive",
      'Upgrade-Insecure-Requests': "1",
      'DNT': "1",
      'TE': "Trailers"
    },
    form: {email: myEmail, password: myPassword}
  };
  request.post(options1, callback1);
};
// Entry point of the request chain: a plain request for urlString0 whose
// response is handed to callback0.
var options0 = {
  url: urlString0,
  headers: {
    'Host': hostName,
    'User-Agent': myUserAgent,
    'Accept': myAccept,
    'Accept-Language': myAcceptLanguage,
    'Accept-Encoding': myAcceptEncoding,
    'Referer': "https://" + hostName,
    'Connection': "keep-alive",
    'Upgrade-Insecure-Requests': "1"
  }
};

request(options0, callback0);
感谢您提供的任何帮助。