JSON文档神秘地被“清空”

时间:2014-11-26 08:40:14

标签: json node.js

我在Node.js中编写了一个小脚本来抓取网页并获取其中的一些链接。抓取(scraping)部分由Cheerio完成。我的代码如下(为节省篇幅做了简化):

var request = require('request');
var cheerio = require('cheerio');

// Base URL of the finance section; page names are appended to it.
var base_url = 'http://www.naftemporiki.gr/finance/';

// Result document: a timestamp plus the list of scraped companies.
var mutuals = {};
mutuals.date = new Date();
mutuals.companies = [];

// Holder for the company record being built; reassigned for every link below.
var company = {};  

// request() only starts the HTTP call and returns; the function passed to it
// runs later, once the response (or an error) is available.
request(base_url + 'mtfCompanies', function (error, response, html) {
    // Parse the page only on a successful response; failures are ignored here.
    if (!error && response.statusCode == 200) {
        var $ = cheerio.load(html);

        // Record the text and href of every anchor matched by the selector.
        $('.blueRow.texttd.name a').each(function (i, element) {
            var a = $(this);

            company = {};
            company.name = a.text();
            company.link = a.attr('href');

            mutuals.companies.push(company);
        });
    }
    //console.log(mutuals);          // 1st place
});
// This statement executes right after request() has been called, i.e. before
// the callback above has had a chance to fill mutuals.companies.
console.log(mutuals);                // 2nd place

有趣的部分来了:当我在“1st place”处(即request的回调内部)输出JSON文档时,结果完整而且正确。下面是一个例子:

{ date: Wed Nov 26 2014 10:35:09 GMT+0200 (EET),
  companies: 
   [ { name: ' J.P. MORGAN ASSET MANAGEMENT',
       link: 'mtfCompany?id=J.P.+MORGAN+ASSET+MANAGEMENT' },
     { name: ' BNP PARIBAS INVESTMENT PARTNERS',
       link: 'mtfCompany?id=BNP+PARIBAS+INVESTMENT+PARTNERS' },
     { name: ' PICTET', link: 'mtfCompany?id=PICTET' },
     { name: ' ALLIANZ ΑΕΔΑΚ',
       link: 'mtfCompany?id=ALLIANZ+%ce%91%ce%95%ce%94%ce%91%ce%9a' },
     { name: ' ALLIANZ ΑΕΔΑΚ (ΑΝΤΙΠΡ.)',
       link: 'mtfCompany?id=ALLIANZ+%ce%91%ce%95%ce%94%ce%91%ce%9a+(%ce%91%ce%9d%ce%a4%ce%99%ce%a0%ce%a1.)' },
     { name: ' ALLIANZ ΕΛΛΑΣ Α.Ε.',
       link: 'mtfCompany?id=ALLIANZ+%ce%95%ce%9b%ce%9b%ce%91%ce%a3+%ce%91.%ce%95.' }]}

而当我在“2nd place”处(位于所有代码块之外、脚本的末尾)输出JSON文档时,得到的却是:

{ date: Wed Nov 26 2014 10:35:09 GMT+0200 (EET), companies: [] }

看起来JSON文档中的'companies'数组被清空了。我怀疑'mutuals.companies = [];'这一行由于某种原因被再次执行了。

有人能帮忙吗?

更新1:

根据建议修改我的代码以使用'async.series ...'。这是更新版本:

var request = require('request'),
    async = require('async'),
    cheerio = require('cheerio');

// Base URL of the finance section; page names are appended to it.
var base_url = 'http://www.naftemporiki.gr/finance/';

// Result document: a timestamp plus the list of scraped companies.
var mutuals = {};
mutuals.date = new Date();
mutuals.companies = [];

// Holder for the company record being built; reassigned for every link below.
var company = {};

// Two steps intended to run in order: fetch and parse the page, then print
// the result. Each step signals completion by invoking its `callback`.
async.series([
    function(callback) {
        request(base_url + 'mtfCompanies', function (error, response, html) {
            // Parse the page only on a successful response; failures are ignored here.
            if (!error && response.statusCode == 200) {
                var $ = cheerio.load(html);

                // Record the text and href of every anchor matched by the selector.
                $('.blueRow.texttd.name a').each(function (i, element) {
                    var a = $(this);

                    company = {};
                    company.name = a.text();
                    company.link = a.attr('href');

                    mutuals.companies.push(company);
                });
            }
        });
        // NOTE: this sits outside the request callback, so the step is reported
        // as finished as soon as the request has been issued, not when the
        // response has been processed.
        callback(null, 'one');
    },
    function (callback) {
        // Prints whatever has been collected by the time this step runs.
        console.log(mutuals);
        callback(null, 'two');
    }
]);

仍然无效。输出的JSON仍然是:

{ date: Wed Nov 26 2014 10:35:09 GMT+0200 (EET), companies: [] }

1 个答案:

答案 0 :(得分:3)

您在“2nd place”处的代码是在请求完成之前打印变量的。

“1st place”处的代码之所以有效,是因为它位于请求的回调中:先发出请求,取回数据,然后回调被调用,此时打印就能得到正确结果。

这就是异步代码的工作方式:没有任何调用会阻塞。因此,当您发出请求时,node会先把回调函数保存起来,然后继续执行后面的代码;等请求的结果返回后,再用该结果调用回调。

更新1:

您更新后的代码存在大致相同的问题。在series的第一个函数中,callback在请求完成之前就被调用了。如果把对callback的调用移到传给request的函数内部,它就会在请求完成之后才被调用。

function(callback) {
    request(base_url + 'mtfCompanies', function (error, response, html) {
        if (!error && response.statusCode == 200) {
            var $ = cheerio.load(html);

            $('.blueRow.texttd.name a').each(function (i, element) {
                var a = $(this);

                company = {};
                company.name = a.text();
                company.link = a.attr('href');

                mutuals.companies.push(company);
            });
            callback(null, 'one');
        }
    });
},

建议1

在node.js中使用回调进行开发,很容易产生深层的嵌套结构。不要让if语句使嵌套变得更深,请使用提前返回(early return)来代替更深的嵌套。例如:

function(callback) {
    request(base_url + 'mtfCompanies', function (error, response, html) {
        if(error) return callback(error);
        if(response.statusCode !== 200) return callback('status code not 200');
        var $ = cheerio.load(html);

        $('.blueRow.texttd.name a').each(function (i, element) {
            var a = $(this);

            company = {};
            company.name = a.text();
            company.link = a.attr('href');

            mutuals.companies.push(company);
        });
        callback(null, 'one');
    });
},

建议2

使用async时,改用命名函数可以让代码更加简洁。例如:

var request = require('request'),
    async = require('async'),
    cheerio = require('cheerio');

// Base URL of the finance section; page names are appended to it.
var base_url = 'http://www.naftemporiki.gr/finance/';

// Result document: creation timestamp plus the (initially empty) company list.
var mutuals = {
    date: new Date(),
    companies: []
};

// Module-level holder for a single company record.
var company = {};

/**
 * Fetches the company listing page and appends one `{name, link}` record per
 * matched anchor to `mutuals.companies`.
 *
 * @param {function} callback - Node-style callback, invoked exactly once:
 *   with an Error when the request fails or the status is not 200,
 *   otherwise as `callback(null, 'one')` after all links were collected.
 */
function getPage(callback) {
    request(base_url + 'mtfCompanies', function (error, response, html) {
        if (error) return callback(error);
        if (response.statusCode !== 200) {
            // Report failures as Error objects rather than bare strings, so the
            // consumer gets a stack trace and can rely on `err.message`.
            return callback(new Error('status code not 200'));
        }
        var $ = cheerio.load(html);

        $('.blueRow.texttd.name a').each(function (i, element) {
            var a = $(this);

            // Build each record locally instead of writing through a shared
            // module-level variable.
            mutuals.companies.push({
                name: a.text(),
                link: a.attr('href')
            });
        });
        callback(null, 'one');
    });
}

/**
 * Series step that writes the `mutuals` document to stdout.
 *
 * @param {function} callback - invoked as `callback(null, 'two')` after logging.
 */
function printMutuals(callback) {
    var stepResult = 'two';

    console.log(mutuals);
    callback(null, stepResult);
}

// Run the steps strictly in order: printMutuals starts only after getPage has
// invoked its callback.
async.series([
    getPage,
    printMutuals
], function (err) {
    // Without a final callback, an error reported by a step would end the
    // series silently and the script would just print nothing.
    if (err) console.error(err);
});