我正在Express中使用Request.js和Cheerio.js构建一个简单的爬虫(scraper)。目前我只想获取网站的标题。我没有逐个抓取网站,而是把网址列表放在一个数组中,依次请求并解析它们,然后用Cheerio.js找到每个网站的标题。当我用console.log输出这些标题时,它们显示正常,但我最终想把它们显示在html页面上。请注意,我对编程非常陌生,所以如果你能提供详细的反馈,那将非常有帮助(下面是我一直在编写的代码)。提前谢谢!
/**
 * Request every URL and collect the text of each page's <title> element.
 *
 * `request` reports its result through a callback, so the titles are not
 * available yet when this function returns. Pass `callback` to be told when
 * every request has finished (successfully or not).
 *
 * @param {string[]} urls - Addresses to request.
 * @param {function(string[]): void} [callback] - Optional. Called exactly once,
 *   after the last request callback has run, with the collected titles.
 * @returns {string[]} The array the titles are pushed into. It is the same
 *   array that is handed to `callback`; titles are appended in the order the
 *   responses arrive, and failed requests contribute no entry.
 */
function parseSites(urls, callback) {
  const parsedSites = [];
  let pending = urls.length;

  // Notify the caller once nothing is left to wait for.
  const notifyIfDone = () => {
    if (pending === 0 && typeof callback === 'function') {
      callback(parsedSites);
    }
  };

  // An empty list issues no request, so report completion right away.
  notifyIfDone();

  urls.forEach((site) => {
    request(site, (err, res, body) => {
      if (err) {
        console.log(err);
      } else {
        const $ = cheerio.load(body);
        parsedSites.push($('title').text());
      }
      // Count failures as finished too, otherwise one bad URL would
      // keep the completion callback from ever firing.
      pending -= 1;
      notifyIfDone();
    });
  });

  return parsedSites;
}
答案 0 :(得分:1)
请参阅以下可以正常工作的代码:
var request = require('request-promise')
var cheerio = require("cheerio")
/**
 * Download all URLs in parallel and report the <title> text of every page.
 *
 * @param {string[]} urls - Addresses to download; each one is passed to getPage().
 * @param {function(string[], Error=): void} callback - Called once. On success it
 *   receives the titles in the same order as `urls`. If a download or the
 *   parsing fails it receives the (empty) result array and the error as the
 *   second argument.
 * @returns {string[]} The array that receives the titles. It is still empty
 *   when this function returns and is filled just before `callback` runs.
 */
function parseSites(urls, callback) {
  const parsedSites = [];
  const promiseList = urls.map(getPage);

  Promise.all(promiseList)
    .then((data) => data.map(parse))
    .then(
      (titles) => {
        titles.forEach((title) => parsedSites.push(title));
        callback(parsedSites);
      },
      // Passed as the second argument of then() rather than a trailing catch(),
      // so an exception thrown by `callback` itself does not invoke it twice.
      (err) => {
        console.error(err);
        callback(parsedSites, err);
      }
    );

  return parsedSites;
}
/**
 * Start a GET request for one address.
 *
 * @param {string} url - Address to request.
 * @returns {*} Whatever `request.get` returns for that address
 *   (presumably a promise for the response body, since `request` is
 *   loaded from 'request-promise' — confirm against that library).
 */
function getPage(url) {
  const pendingPage = request.get(url);
  return pendingPage;
}
/**
 * Extract the text of the <title> element from an HTML document.
 *
 * @param {string} body - HTML source handed to cheerio.
 * @returns {string} Text content of the elements matched by the 'title' selector.
 */
function parse(body) {
  console.log("parsing body");
  const titleText = cheerio.load(body)('title').text();
  return titleText;
}
// Example run: download two pages and print the titles handed to the callback.
parseSites(
  ['https://www.google.com', 'https://www.facebook.com'],
  (data) => {
    console.log(data);
  }
);
答案 1 :(得分:0)
首先,您需要了解异步代码和同步代码之间的区别。让我们看一个例子:
// Logs the integers 0 through 4, one per call to console.log.
function testFor() {
  let n = 0;
  while (n < 5) {
    console.log(n);
    n += 1;
  }
}
-
// Synchronous demo: testFor() is called directly, so it finishes
// before the statement after it runs.
console.log('start:');
testFor();
console.log('end:');
// Here you get the expected output because this code is synchronous.
//output:
start:
0
1
2
3
4
end:
-
// Asynchronous demo: setTimeout only schedules testFor to run after a delay of
// 1000 ms, so 'end:' is logged before testFor prints anything.
console.log('start:');
setTimeout(testFor,1000);
console.log('end:');
// Here you don't get your expected output because setTimeout is asynchronous.
//output:
start:
end:
0
1
2
3
4
其次,你的代码中有一个语法错误!
// Copy of the asker's function with the syntax error fixed: an extra closing
// bracket that originally followed the if/else block has been removed.
// Note that the array is returned right after the requests have been started;
// titles are pushed into it only when `request` invokes the callback, so with
// an asynchronous `request` the caller receives an array that is still empty.
function parseSites(urls) {
  const parsedSites = [];

  const onResponse = (err, res, body) => {
    if (err) {
      console.log(err);
      return;
    }
    parsedSites.push(cheerio.load(body)('title').text());
  };

  urls.forEach((site) => {
    request(site, onResponse);
  });

  return parsedSites;
}
所以你的问题在于:forEach循环中的`request`是一个异步函数,只有在收到网页的响应之后,才会调用回调函数`function(err, res, body)`;而`return parsedSites`在这些回调执行之前就已经运行了,因此返回的数组是空的。
我的解决方案:
'use strict'
const cheerio = require('cheerio');
const request = require('request');
const async = require('async');
// Pages whose titles the example solutions below collect.
const urls = [
  'http://stackoverflow.com/',
  'http://hackaday.com/',
  'https://www.raspberrypi.org/',
  'https://cheerio.js.org/',
];
// SOLUTION 1: request the sites one after another through recursion and act
// on the results only once every call is done.
let i = 0; // index into `urls`; parseSites increments it on every call
const parsedSites = []; // titles collected so far
parseSites(urls[i], parsedSites);
// Completion handler: prints the collected titles.
// parseSites calls it once no URL is left to request.
function finalCall(sites) {
console.log(sites);
}
/**
 * Request one site, record its <title> text, then move on to the next entry
 * of the module-level `urls` list. When the list is exhausted the collected
 * titles are handed to finalCall().
 *
 * Relies on the module-level counter `i` (index of the next URL to visit),
 * which it increments on every call.
 *
 * @param {string} site - Address to request now. A missing value
 *   (`undefined`/`null`, e.g. `urls[0]` of an empty list) is skipped instead of
 *   being passed to `request`.
 * @param {string[]} parsedSites - Accumulator the titles are pushed into.
 * @returns {void} Nothing: the result only exists once the request callbacks
 *   have run, so it is delivered through finalCall() instead.
 */
function parseSites(site, parsedSites) {
  ++i; // `i` now points at the entry to visit after this one

  // Continue with the next URL, or report the results when none is left.
  const proceed = () => {
    if (i < urls.length) {
      parseSites(urls[i], parsedSites); // recursive call
    } else {
      finalCall(parsedSites); // when all sites are done
    }
  };

  if (site == null) {
    // Nothing to request for this slot; still advance so finalCall() is reached.
    proceed();
    return;
  }

  request(site, (err, res, body) => {
    if (err) {
      console.log(err);
    } else {
      const $ = cheerio.load(body);
      const title = $('title').text();
      console.log(title);
      parsedSites.push(title);
    }
    proceed();
  });
}
//SOLUTION 2: do what you need to do when all calls are done using 'async'
// NOTE(review): this solution declares parseSites and finalCall again, the same
// names solution 1 uses — presumably each solution is meant to be run as its
// own script; confirm before keeping both in one file.
// The call can precede the definition because function declarations are hoisted.
parseSites(urls);
// Completion handler: prints the collected titles.
// parseSites calls it when async.each finishes without an error.
function finalCall(sites) {
console.log(sites);
}
/**
 * Request every URL through async.each and collect the <title> text of each
 * page. When async.each reports completion without an error the titles are
 * passed to finalCall(); if it reports an error, that error is logged instead.
 *
 * @param {string[]} urls - Addresses to request.
 * @returns {void}
 */
function parseSites(urls) {
  const titles = [];

  // Iteratee for async.each: request one site and signal `done` when finished.
  const fetchTitle = (site, done) => {
    request(site, (err, res, body) => {
      if (err) {
        done(err);
        return;
      }
      titles.push(cheerio.load(body)('title').text());
      done();
    });
  };

  // Final callback for async.each.
  const onFinished = (err) => {
    if (err) {
      console.log(err);
      return;
    }
    finalCall(titles);
  };

  async.each(urls, fetchTitle, onFinished);
}