我正在编写一个程序:先从一个网站上抓取一批链接,然后逐个抓取这些链接以获取信息。抓取该网站之前必须先登录,所以顺序是:登录 -> 抓取链接索引页 -> 逐个抓取链接的详细信息
传给 login 函数的回调打印出的结果数组是空的:{ results: [], hasMore: true },所以我的代码有问题(抓取部分本身是有效的):
// HTTP client used by login(), scrapeTorrents() and scrapeTorrentDetails().
var request = require('request');
var request = request.defaults({jar: true}); // necessary for persistent login
// HTML parser that provides the jQuery-like $ used by the scraping functions.
var cheerio = require('cheerio');
// Index page that is scraped for detail links once the login request has returned.
var url1 = "https://example.org/torrents/browse/index/";
// Endpoint the credentials are POSTed to.
var loginUrl = "https://example.org/user/account/login/";
// Form fields that login() url-encodes into the POST body.
var credentials = {
username: 'user1',
password: 'passpass'
};
// Entry point: start the login -> index -> details chain and print what it reports.
// login is a function declaration further down, so it is hoisted and callable here.
login(function (result) {
console.log(result);
});
/**
 * POST the module-level credentials to loginUrl and, once the response
 * arrives without an error, start scraping the index page at url1.
 *
 * @param {function(Object)} callback - receives whatever scrapeTorrents reports.
 *   It is not invoked when the login request fails; the failure is only logged.
 */
function login(callback) {
  function onLoginResponse(loginError) {
    if (loginError) {
      console.log("Login error");
      return;
    }
    // Logged in: hand over to the index scraper and relay its result unchanged.
    scrapeTorrents(url1, function (scraped) {
      callback(scraped);
    });
  }

  request.post(
    {
      uri: loginUrl,
      headers: { 'content-type': 'application/x-www-form-urlencoded' },
      // The credentials travel as a url-encoded form body.
      body: require('querystring').stringify(credentials)
    },
    onLoginResponse
  );
}
/**
 * Fetch the index page at `url`, collect the href of the first child of every
 * `span.title` element, and pass those links on to scrapeTorrentDetails.
 *
 * @param {string} url - address of the index page.
 * @param {function(Object)} callback - receives whatever scrapeTorrentDetails
 *   reports. It is not invoked when the request fails; the failure is only logged.
 */
function scrapeTorrents(url, callback) {
  // Gather the detail-page hrefs in document order (a missing href is kept as undefined).
  function collectDetailLinks(html) {
    var $ = cheerio.load(html);
    var hrefs = [];
    $('span.title').each(function () {
      hrefs.push($(this).children().eq(0).attr('href'));
    });
    return hrefs;
  }

  request(url, function (err, res, body) {
    if (err) {
      console.log("Main scrape error");
      return;
    }
    scrapeTorrentDetails(collectDetailLinks(body), function (details) {
      callback(details);
    });
  });
}
/**
 * Request the details page of every link, build one movie object per page and
 * report them through `callback` as { results: [...], hasMore: true }.
 *
 * @param {Array} links - paths that are appended to "https://example.org".
 * @param {function(Object)} callback - called once, right after the loop below
 *   has started the requests.
 *
 * NOTE(review): `results` is only filled inside the request callbacks, while
 * `callback` is invoked immediately after the loop. If request() delivers its
 * responses asynchronously, `results` is still empty when it is handed over.
 */
function scrapeTorrentDetails(links, callback) {
var results = [];
// Fetch one details page and push the movie parsed from it onto `results`.
function getDetails(url) {
request(url, function(err, res, body) {
if(err) {
console.log("Detail scrape error");
return;
}
console.log("Scraping: " + url);
var $ = cheerio.load(body);
// All values are read from fixed <td> positions on the page.
var tds = $('td');
var title = $(tds).get(1).firstChild.data;
var hash = $(tds).get(3).firstChild.data.trim();
var size = $(tds).get(9).firstChild.data;
// console.log(tds.length);
// More than 24 cells: read the extra movie fields; otherwise use "notfound" placeholders.
// NOTE(review): the guard is > 23, but cells up to index 33 are read below.
if (tds.length > 23) {
var rlsDate = $(tds).get(23).firstChild.data || '';;
var genres = $(tds).get(27).firstChild.data || '';;
var runtime = $(tds).get(31).firstChild.data || '';;
// plot stays undefined when cell 33 has no child node.
if ( $(tds).get(33).firstChild != null) {
var plot = $(tds).get(33).firstChild.data || '';;
}
var rating = $('#imdb_rating').parent().next().text() || '';; // of 10
var imdb_id = $('[name=imdbID]').get(0).attribs.value || '';;
var cover = $('#cover').children().eq(0).get(0).attribs.href || '';;
var thumb = $('[alt=Cover]').get(0).attribs.src || '';;
// NOTE(review): cover is always a string here because of the `|| ''` above,
// so this typeof check never matches.
if (typeof cover == 'undefined') {
cover = thumb;
}
} else {
var rlsDate = "notfound";
var genres = "notfound";
var runtime = "notfound";
var plot = "notfound";
var rating = "notfound"; // of 10
var imdb_id = "notfound";
var cover = "notfound";
var thumb = "notfound";
}
var movie = {
type: 'movie',
imdb_id: imdb_id,
title: title,
year: rlsDate,
genre: genres,
rating: rating,
runtime: runtime,
image: thumb,
cover: cover,
synopsis: plot,
torrents: {
magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
filesize: size
}
};
results.push(movie);
});
}
// Start one request per link; nothing here waits for the responses.
for (var i=0; i<links.length; i++){
getDetails("https://example.org" + links[i]);
}
// Hands over `results` as it is at this moment (see the NOTE in the header).
callback( {
results: results,
hasMore: true
});
}
也许使用 Q 的 promise 会更好。我该如何在上面的代码中实现它?
如果你想知道这段代码的用途:我计划修改 Popcorn-time,让它使用另一个(没有 API 的)torrent tracker。
谢谢
答案 0 :(得分:1)
主要问题在于此代码:
// Starts one getDetails() call per link ...
for (var i=0; i<links.length; i++){
getDetails("https://example.org" + links[i]);
}
// ... and then invokes callback right away, without waiting for any of those calls.
callback( {
results: results,
hasMore: true
});
getDetails()
是异步的,但您只是调用了它 links.length
次就继续往下执行——就好像这些调用已经完成了一样。因此,当您调用回调并试图传递结果时,getDetails()
发出的请求一个都还没有完成,结果尚未被填入,所以数组是空的。
您在代码的其他地方都(按需要)使用了嵌套回调,唯独在这一处出了疏漏。在用结果调用最终回调之前,您需要知道所有 getDetails()
调用何时完成。
此外,您还必须决定:是可以并行发出所有 getDetails()
调用(同时全部进行),还是真正想要的是调用一个、等它完成、再调用下一个,依此类推……现在您是让它们全部同时进行,只要目标服务器不反对同时收到这么多请求,这样做是可行的。
有几种可能的解决方法。
向 getDetails()
添加一个回调,然后统计收到的 getDetails()
回调次数,只有当次数达到 links.length
(即全部调用都已完成)时,才调用最终的回调。
更改 getDetails()
使其返回一个 promise。然后,您可以使用类似 links.map(getDetails)
的写法创建一个 promise 数组,再使用 Promise.all()
来得知它们何时全部完成。
就个人而言,我会更改您的所有代码以使用promises,并且我会使用Bluebird promises库来获取Promise.map()
等额外功能,以使其更加简单。
这是一个修补程序,它会向getDetails()
添加一个回调,然后计算完成的数量:
/**
 * Request the details page of every link in parallel, build one movie object
 * per page, and report them once every request has settled.
 *
 * @param {Array} links - paths that are appended to "https://example.org".
 * @param {function(Object)} callback - invoked exactly once with
 *   { results: [...], hasMore: true }. Pages that fail to download or to parse
 *   are logged and left out of `results`; the order of `results` follows the
 *   order in which the responses arrive.
 */
function scrapeTorrentDetails(links, callback) {
  var results = [];
  var doneCnt = 0;

  // Hand the collected movies to the caller.
  function finish() {
    callback({
      results: results,
      hasMore: true
    });
  }

  // Build the movie object from the HTML of one details page; throws when the
  // page does not have the expected structure.
  function parseMovie(body) {
    var $ = cheerio.load(body);
    var tds = $('td');
    var title = $(tds).get(1).firstChild.data;
    var hash = $(tds).get(3).firstChild.data.trim();
    var size = $(tds).get(9).firstChild.data;
    var rlsDate = "notfound";
    var genres = "notfound";
    var runtime = "notfound";
    var plot = "notfound";
    var rating = "notfound"; // of 10
    var imdb_id = "notfound";
    var cover = "notfound";
    var thumb = "notfound";

    // Cell 33 is the highest index read below, so require it to exist rather
    // than only cell 23.
    if (tds.length > 33) {
      rlsDate = $(tds).get(23).firstChild.data || '';
      genres = $(tds).get(27).firstChild.data || '';
      runtime = $(tds).get(31).firstChild.data || '';
      // An empty plot cell yields '' instead of leaving the field undefined.
      plot = '';
      if ($(tds).get(33).firstChild != null) {
        plot = $(tds).get(33).firstChild.data || '';
      }
      rating = $('#imdb_rating').parent().next().text() || ''; // of 10
      imdb_id = $('[name=imdbID]').get(0).attribs.value || '';
      cover = $('#cover').children().eq(0).get(0).attribs.href || '';
      thumb = $('[alt=Cover]').get(0).attribs.src || '';
      // cover is '' (never undefined) when the link has no href, so test for
      // emptiness to make the thumbnail fallback actually apply.
      if (!cover) {
        cover = thumb;
      }
    }

    return {
      type: 'movie',
      imdb_id: imdb_id,
      title: title,
      year: rlsDate,
      genre: genres,
      rating: rating,
      runtime: runtime,
      image: thumb,
      cover: cover,
      synopsis: plot,
      torrents: {
        magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
        filesize: size
      }
    };
  }

  // Fetch one details page; `done` is called exactly once, with an error when
  // the download or the parsing failed.
  function getDetails(url, done) {
    request(url, function(err, res, body) {
      if (err) {
        console.log("Detail scrape error");
        done(err);
        return;
      }
      console.log("Scraping: " + url);
      var movie;
      try {
        movie = parseMovie(body);
      } catch (parseErr) {
        // Without this, an unexpected page layout would throw out of the
        // request callback and `done` would never be reached.
        console.log("Detail parse error: " + url);
        done(parseErr);
        return;
      }
      results.push(movie);
      // Kept outside the try block so an exception thrown by the caller's
      // callback is not mistaken for a parse error.
      done();
    });
  }

  // Count settled requests (successful or not) and finish after the last one.
  function onDetailDone() {
    ++doneCnt;
    if (doneCnt === links.length) {
      finish();
    }
  }

  if (links.length === 0) {
    // No request will ever complete, so report the empty result here; deferred
    // so the callback is not run before this function returns.
    process.nextTick(finish);
    return;
  }

  for (var i = 0; i < links.length; i++) {
    getDetails("https://example.org" + links[i], onDetailDone);
  }
}
答案 1 :(得分:0)
以下是使用 bind 重写的示例代码,其中用到了自定义的 this 对象以及尚未完成的请求计数(我认为 promises 会掩盖执行路径)。
回调返回空数组的原因似乎是文档中没有匹配选择器 span.title(即 class 为 title)的 span 元素,因此不会触发其他请求。
/*
 * Log in, scrape the index page for detail links, scrape every detail page and
 * hand the collected movies to the callback given to login().
 *
 * The steps are chained with bind(): a single context object
 * ({ callback, results, resultCount }) travels along as `this`, and
 * resultCount holds the number of detail requests that have not settled yet.
 */
var
  request = require('request').defaults({
    jar: true
  }), // necessary for persistent login
  cheerio = require('cheerio'),
  process = require('process'),
  url1 = "https://example.org/torrents/browse/index/",
  loginUrl = "https://example.org/user/account/login/",
  // Step 1: POST the credentials; fna continues with `this.callback` set.
  login = function(callback) {
    request.post({
      uri: loginUrl,
      headers: {
        'content-type': 'application/x-www-form-urlencoded'
      },
      body: require('querystring').stringify({
        username: 'user1',
        password: 'passpass'
      })
    }, fna.bind({
      callback: callback
    }));
  },
  // Step 2: after the login response, fetch the index page.
  fna = function(err, res, body) {
    if (err) {
      console.log("Login error");
      return;
    }
    request(url1, fnb.bind(this));
  },
  // Step 3: collect the detail links and start one request per link.
  fnb = function(err, res, body) {
    if (err) {
      console.log("Main scrape error");
      return;
    }
    var
      $ = cheerio.load(body),
      links = [],
      detailUrl;
    $('span.title').each(function() {
      links.push($(this).children().first().attr('href'));
    });
    this.results = [];
    this.resultCount = links.length;
    if (this.resultCount) {
      for (var i = 0; i < links.length; i++) {
        detailUrl = "https://example.org" + links[i];
        // Bind the URL as a leading argument so fnc knows which page it is
        // handling; request() itself only supplies (err, res, body).
        request(detailUrl, fnc.bind(this, detailUrl));
      }
    } else {
      // Nothing to fetch: report the empty result on the next tick.
      process.nextTick(fne.bind(this));
    }
  },
  // Step 4: handle one detail response. Failures are logged and skipped, but
  // still count as settled so the final callback is always reached.
  fnc = function(url, err, res, body) {
    if (err) {
      console.log("Detail scrape error");
      fnd.call(this);
      return;
    }
    console.log("Scraping: " + url);
    var movie;
    try {
      movie = parseMovie(body);
    } catch (parseErr) {
      console.log("Detail parse error: " + url);
      fnd.call(this);
      return;
    }
    this.results.push(movie);
    fnd.call(this);
  },
  // Mark one detail request as settled; report once none are outstanding.
  fnd = function() {
    this.resultCount--;
    if (this.resultCount === 0) {
      fne.call(this);
    }
  },
  // Build the movie object from the HTML of one details page; throws when the
  // page does not have the expected structure.
  parseMovie = function(body) {
    var
      $ = cheerio.load(body),
      tds = $('td'),
      title = $(tds).get(1).firstChild.data,
      hash = $(tds).get(3).firstChild.data.trim(),
      size = $(tds).get(9).firstChild.data,
      rlsDate = "notfound",
      genres = "notfound",
      runtime = "notfound",
      plot = "notfound",
      rating = "notfound", // of 10
      imdb_id = "notfound",
      cover = "notfound",
      thumb = "notfound";
    // Cell 33 is the highest index read below, so require it to exist rather
    // than only cell 23.
    if (tds.length > 33) {
      rlsDate = $(tds).get(23).firstChild.data || '';
      genres = $(tds).get(27).firstChild.data || '';
      runtime = $(tds).get(31).firstChild.data || '';
      // An empty plot cell yields '' rather than keeping the "notfound" marker.
      plot = '';
      if ($(tds).get(33).firstChild != null) {
        plot = $(tds).get(33).firstChild.data || '';
      }
      rating = $('#imdb_rating').parent().next().text() || ''; // of 10
      imdb_id = $('[name=imdbID]').get(0).attribs.value || '';
      cover = $('#cover').children().eq(0).get(0).attribs.href || '';
      thumb = $('[alt=Cover]').get(0).attribs.src || '';
      // cover is '' (never undefined) when the link has no href, so test for
      // emptiness to make the thumbnail fallback actually apply.
      if (!cover) {
        cover = thumb;
      }
    }
    return {
      type: 'movie',
      imdb_id: imdb_id,
      title: title,
      year: rlsDate,
      genre: genres,
      rating: rating,
      runtime: runtime,
      image: thumb,
      cover: cover,
      synopsis: plot,
      torrents: {
        magnet: 'magnet:?xt=urn:btih:' + hash + '&tr=http://tracker.example.org:2710/a/announce',
        filesize: size
      }
    };
  },
  // Final step: hand the collected movies to the caller of login().
  fne = function() {
    this.callback({
      results: this.results,
      hasMore: true
    });
  };
login(function(result) {
  console.log(result);
});