我用Node.js写了一个小脚本来抓取网页并提取其中的一些链接。抓取(scraping)部分由Cheerio完成。我的代码如下(为节省篇幅已做简化):
// Scrape the company list from the mtfCompanies page and collect name/link pairs.
var request = require('request');
var cheerio = require('cheerio');
var base_url = 'http://www.naftemporiki.gr/finance/';
// Result container: creation timestamp plus the company list filled in by the request callback.
var mutuals = {};
mutuals.date = new Date();
mutuals.companies = [];
// Module-level holder, reassigned to a fresh object for every matched link.
var company = {};
// The function passed to request() runs later, once the response has arrived.
request(base_url + 'mtfCompanies', function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
// Each matched anchor contributes one { name, link } entry.
$('.blueRow.texttd.name a').each(function (i, element) {
var a = $(this);
company = {};
company.name = a.text();
company.link = a.attr('href');
mutuals.companies.push(company);
});
}
//console.log(mutuals); // 1st place
});
// This statement executes before the request callback above has run,
// so mutuals.companies is still empty when it is printed here.
console.log(mutuals); // 2nd place
有趣的地方在于:当我在“请求”回调内部的“第一处”(代码中标注 1st place 的位置)输出该对象时,结果完整且正确。下面是一个例子:
{ date: Wed Nov 26 2014 10:35:09 GMT+0200 (EET),
companies:
[ { name: ' J.P. MORGAN ASSET MANAGEMENT',
link: 'mtfCompany?id=J.P.+MORGAN+ASSET+MANAGEMENT' },
{ name: ' BNP PARIBAS INVESTMENT PARTNERS',
link: 'mtfCompany?id=BNP+PARIBAS+INVESTMENT+PARTNERS' },
{ name: ' PICTET', link: 'mtfCompany?id=PICTET' },
{ name: ' ALLIANZ ΑΕΔΑΚ',
link: 'mtfCompany?id=ALLIANZ+%ce%91%ce%95%ce%94%ce%91%ce%9a' },
{ name: ' ALLIANZ ΑΕΔΑΚ (ΑΝΤΙΠΡ.)',
link: 'mtfCompany?id=ALLIANZ+%ce%91%ce%95%ce%94%ce%91%ce%9a+(%ce%91%ce%9d%ce%a4%ce%99%ce%a0%ce%a1.)' },
{ name: ' ALLIANZ ΕΛΛΑΣ Α.Ε.',
link: 'mtfCompany?id=ALLIANZ+%ce%95%ce%9b%ce%9b%ce%91%ce%a3+%ce%91.%ce%95.' }]}
而当我在所有代码块之外、脚本末尾的“第二处”(代码中标注 2nd place 的位置)输出该对象时,得到的却是:
{ date: Wed Nov 26 2014 10:35:09 GMT+0200 (EET), companies: [] }
看起来对象中的 'companies' 数组被清空了。我怀疑 'mutuals.companies = [];' 这一行由于某种原因被再次执行了。
任何人都可以帮忙吗?
更新1:
根据建议修改我的代码以使用'async.series ...'。这是更新版本:
// Updated attempt: run the fetch and the print as two async.series tasks.
var request = require('request'),
async = require('async'),
cheerio = require('cheerio');
var base_url = 'http://www.naftemporiki.gr/finance/';
// Result container: creation timestamp plus the company list filled in by the request callback.
var mutuals = {};
mutuals.date = new Date();
mutuals.companies = [];
// Module-level holder, reassigned to a fresh object for every matched link.
var company = {};
async.series([
// Task 1: issue the request for the company listing.
function(callback) {
request(base_url + 'mtfCompanies', function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
$('.blueRow.texttd.name a').each(function (i, element) {
var a = $(this);
company = {};
company.name = a.text();
company.link = a.attr('href');
mutuals.companies.push(company);
});
}
});
// NOTE: this call sits outside the request callback, so the task is reported
// as finished right after the request is issued, before any response is parsed.
callback(null, 'one');
},
// Task 2: print the collected data.
function (callback) {
console.log(mutuals);
callback(null, 'two');
}
]);
仍然无效。输出的JSON仍然是:
{ date: Wed Nov 26 2014 10:35:09 GMT+0200 (EET), companies: [] }
答案 0 :(得分:3)
您的“第二处”(2nd place)是在请求完成之前就打印了变量。
您的“第一处”(1st place)之所以有效,是因为它位于请求的回调函数内部:请求发出、数据取回之后,回调才被调用,因此能够正确打印。
这就是异步代码的工作方式:没有任何操作会阻塞。当您发出请求时,node会先保存回调函数,等请求结果返回后再用该结果执行回调中的代码。
更新1:
您更新后的代码问题大致相同。在series的第一个函数中,callback 在请求完成之前就被调用了。如果把对 callback 的调用移到传给 request 的回调函数内部,它就会在请求完成之后才被调用。
// Corrected first series task: callback is now invoked from inside the request
// callback, i.e. only after the response has been parsed.
function(callback) {
request(base_url + 'mtfCompanies', function (error, response, html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
// Collect one { name, link } record per matched anchor.
$('.blueRow.texttd.name a').each(function (i, element) {
var a = $(this);
company = {};
company.name = a.text();
company.link = a.attr('href');
mutuals.companies.push(company);
});
// Signal completion only after mutuals.companies has been filled.
callback(null, 'one');
}
// NOTE(review): on a request error or a non-200 status nothing calls callback in this version.
});
},
建议1
在node.js中使用回调进行开发,很容易产生深层嵌套的结构。不要让if语句使嵌套变得更深,应使用提前返回(early return)来代替更深的嵌套。例如:
function(callback) {
request(base_url + 'mtfCompanies', function (error, response, html) {
if(error) return callback(error);
if(response.statusCode !== 200) return callback('status code not 200');
var $ = cheerio.load(html);
$('.blueRow.texttd.name a').each(function (i, element) {
var a = $(this);
company = {};
company.name = a.text();
company.link = a.attr('href');
mutuals.companies.push(company);
});
callback(null, 'one');
});
},
建议2
使用async时,可以通过命名函数让代码更简洁。例如:
var request = require('request'),
async = require('async'),
cheerio = require('cheerio');
// Base URL that page names such as 'mtfCompanies' are appended to.
var base_url = 'http://www.naftemporiki.gr/finance/';
// Result container: creation timestamp plus the list of scraped companies.
var mutuals = {};
mutuals.date = new Date();
mutuals.companies = [];
// Module-level `company` binding, initially an empty object.
var company = {};
/**
 * Fetches the company listing page and appends one { name, link } record per
 * matched anchor to mutuals.companies.
 *
 * @param {function} callback - Node-style continuation. Called as
 *   callback(err) with an Error when the request fails or the status code is
 *   not 200, otherwise as callback(null, 'one') after all records are pushed.
 */
function getPage(callback) {
    request(base_url + 'mtfCompanies', function (error, response, html) {
        if (error) return callback(error);
        // Pass a real Error (not a bare string) so the consumer gets a stack trace.
        if (response.statusCode !== 200) return callback(new Error('status code not 200'));
        var $ = cheerio.load(html);
        $('.blueRow.texttd.name a').each(function (i, element) {
            var a = $(this);
            // Local record: no need to write through a shared outer variable.
            var company = {
                name: a.text(),
                link: a.attr('href')
            };
            mutuals.companies.push(company);
        });
        callback(null, 'one');
    });
}
/**
 * Series step that writes the collected `mutuals` object to the console and
 * then reports completion.
 *
 * @param {function} callback - Node-style continuation, called as callback(null, 'two').
 */
function printMutuals(callback) {
    var noError = null;
    var stepResult = 'two';
    console.log(mutuals);
    callback(noError, stepResult);
}
// Run the steps strictly in order. The final callback receives the first error
// any step passes along (later steps are then skipped), so a failed request is
// reported instead of being silently dropped.
async.series([
    getPage,
    printMutuals
], function (err) {
    if (err) console.error(err);
});