我尝试使用node.js上的request,cheerio和async构建自定义Web scraper。 我无法弄清楚为什么我在公司参数中得到一些未定义的值导致瀑布崩溃。 我知道代码有点乱,但它大部分都是可读的。我认为我的问题可能来自回调电话。
async.waterfall([
function(callback){
var base_url = 'http://www.architonic.com/fr/pmfairexh/imm-cologne/8550409/';
_.times(3, function(n){
var url = base_url+(n+1);
request(url, function (error, response, html) {
if (error)
callback(url);
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
var links_companies = $('#sheet_content_inside > ul > li > div h2 > a'),
//links_companies = $('a', li_companies),
companies = [];
//console.log(colors.blue(url), links_companies.length);
links_companies.each(function(i, a) {
companies.push({name: $(this).attr('title'), url_from: url, next_url: $(this).attr('href')});
});
callback(null, companies);
}
});
});
},
function(companies, callback){
async.map(companies, function(c, cb){
var url = c.next_url;
if(!_.isUndefined(url))
request(url, function (error, response, html) {
console.log(url, c.name);
if (error)
cb(url);
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html),
profile_link = $('#head_main_content > div > div:nth-child(4) > h6 > a'),
cons = (typeof profile_link == 'undefined')? "WHAT?" : profile_link.attr('href');
//console.log("url founded: "+cons);
// if(typeof profile_link == 'undefined')
// cb(c);
c.origin_url = c.url_from;
c.next_url = profile_link.attr('href');
c.url_from = url;
c.profile_url = c.next_url;
//console.log(c);
// if (!_.isUndefined(c.next_url))
cb(null, c);
// else
// cb(c);
//return _.extend({}, c, {profile_url: profile_link.attr('href'), origin_url: c.url_from});
}
});
}, function(err, _companies){
callback(null, _companies);
});
},
function(companies, callback){
async.map(companies, function(c, cb){
var url = c.next_url;
console.log(colors.green(url), colors.red(c.name));
// if (_.isUndefined(url))
// return cb(c);
if(!_.isUndefined(url))
request(url, function (error, response, html) {
//console.log(url, c.name);
if (error)
cb(url);
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html),
left_zone = $('#sheet_content_inside > div.margin_top_20 > div.left'),
right_zone = $('#sheet_content_inside > div.margin_top_20 > div.right.width_195');
//console.log(left_zone.html(), right_zone.html());
var name = $('span[itemprop="name"]', left_zone).text(),
s_address = $('span[itemprop="streetAddress"]', left_zone).text(),
p_code = $('span[itemprop="postalCode"]', left_zone).text(),
city = $('span[itemprop="addressLocality"]', left_zone).text(),
country = $('span[itemprop="addressCountry"]', left_zone).text();
console.log(name, s_address, p_code, city, country);
cb(null, c);
}
});
}, function(err, _companies){
callback(null, _companies);
});
}
], function(err, companies){
console.log(colors.warn("end"), companies.length);
console.log(err);
console.log(companies);
});
答案 0 :(得分:0)
在找到真正的罪魁祸首之前,您可以清理公司数组,然后再将其传递给下一个函数或每个async.map之前。像这样;
companies = _.compact(companies)
这将确保列表清除空值和未定义