使用Node.js解析HTML表格并创建JSON对象

时间:2018-03-05 16:08:59

标签: json node.js express html-table html-parsing

我想遍历this table并将每个值保存在此类型的JSON对象中:

// Template of one record: every field starts out as an empty string.
var json = {
    vaccine: "",
    country: "",
    year: "",
    value: ""
};

然后,我就可以将这些对象保存到MongoDB中。

该表非常复杂,我不知道如何遍历行和单元格以获得正确的值。

我知道存在this library但是我的表没有id,所以我不知道如何使用它。

我正在使用Node.js、Express.js和Cheerio。这是我的代码:

var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var router = express.Router();

/**
 * Collects the text of every cell of a table into a two-dimensional array.
 *
 * @param {Function} $ - cheerio instance returned by cheerio.load().
 * @param {Object} table - cheerio selection wrapping the table element.
 * @returns {string[][]} one array per <tr>, holding the trimmed text of its th/td cells.
 */
function extractTableRows($, table) {
    var rows = [];

    // A cheerio selection has no DOM-style `rows` / `cells` collections, so the
    // <tr> elements and their direct <th>/<td> children are walked with cheerio's
    // own traversal methods instead of index-based loops.
    table.find('tr').each(function(rowIndex, rowElement) {
        var cells = [];

        $(rowElement).children('th, td').each(function(cellIndex, cellElement) {
            cells.push($(cellElement).text().trim());
        });

        rows.push(cells);
    });

    return rows;
}

/**
 * GET /scraperWhoCoverage
 *
 * Downloads the WHO coverage page, reads the cells of the element that follows
 * the first child of #page, writes them to output.json and then answers the
 * browser. Responds with HTTP 500 when the download or the file write fails.
 */
router.get('/scraperWhoCoverage', function(req, res) {

    // the URL we will scrape from
    var url = 'http://apps.who.int/immunization_monitoring/globalsummary/timeseries/tscoveragebcg.html';

    /**
     * The structure of our request call.
     * The first parameter is our URL.
     * The callback function takes 3 parameters: an error, a response object and the html.
     */
    request(url, function(error, response, html) {

        // without the page there is nothing to parse or write, so stop here
        if (error) {
            console.log(error);
            res.status(500).send('Could not fetch the page to scrape.');
            return;
        }

        console.log("Getted: ", url);

        // utilize the cheerio library on the returned html which will essentially give us jQuery functionality
        var $ = cheerio.load(html);
        var rows = [];

        // the table is taken to be the second child of #page, as in the original traversal
        $('#page').each(function() {
            var table = $(this).children().first().next();
            rows = rows.concat(extractTableRows($, table));
        });

        console.log('Rows extracted: ', rows.length);

        // TODO: map each row to { vaccine, country, year, value, source } objects.
        // The column layout of the page is not known here, so the raw cell grid is
        // written as-is; confirm which rows/columns hold the country, vaccine and years.
        fs.writeFile('output.json', JSON.stringify(rows, null, 4), function(err) {
            if (err) {
                console.log(err);
                res.status(500).send('Could not write output.json.');
                return;
            }

            console.log('File successfully written! - Check your project directory for the output.json file');

            // send out a message to the browser reminding you that this app does not have a UI
            res.send('This app does not have a UI. Check your console (command prompt)!');
        });

    }); // end request

}); // end get

exports = module.exports = router;

谢谢

1 个答案:

答案 0 :(得分:0)

你应该使用这个npm包(here,我自己用过)。它会自动将你的html表格解析为json,然后你就可以用循环把这个json对象重新组织成如下形式:

{ vaccine : "", country : "", year : "", value: ""}

例如:

var tabletojson = require('tabletojson');

// page whose tables are converted, and the options passed along with it
var sourceUrl = 'http://apps.who.int/immunization_monitoring/globalsummary/timeseries/tscoveragedtp1.html';
var conversionOptions = { stripHtmlFromCells: false };

/**
 * Callback handed to tabletojson.convertUrl.
 *
 * @param {*} tablesAsJson - the converted tables as delivered by the library.
 */
function handleConvertedTables(tablesAsJson) {
  // tablesAsJson is your json object: reorganize it here and call
  // JSON.stringify on the result to put it in output.json
}

tabletojson.convertUrl(sourceUrl, conversionOptions, handleConvertedTables);