我想遍历这个表格(this table),并把其中的每个值保存到如下结构的 JSON 对象中:
var json = { vaccine : "", country : "", year : "", value: ""};
这样,我就可以把这些对象保存到 MongoDB 中。
该表非常复杂,我不知道如何遍历行和单元格以获得正确的值。
我知道有这个库(this library),但是我的表格没有 id,所以我不知道该如何使用它。
我正在使用 Node.js、Express.js 和 Cheerio。这是我的代码:
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var router = express.Router();
/**
 * Collects the text of every cell of a table, row by row.
 *
 * @param {Function} $ - cheerio instance returned by cheerio.load().
 * @param {Object} table - cheerio selection wrapping the table element.
 * @returns {string[][]} one array per <tr> that has cells, holding the
 *     trimmed text of each of its <th>/<td> children.
 */
function extractTableRows($, table) {
    var rows = [];
    table.find('tr').each(function() {
        var cells = [];
        $(this).children('th, td').each(function() {
            cells.push($(this).text().trim());
        });
        // Rows without any cell carry no data, so they are skipped.
        if (cells.length > 0) {
            rows.push(cells);
        }
    });
    return rows;
}

/**
 * Finds the index of the first header cell matching a pattern.
 *
 * @param {string[]} header - texts of the header row.
 * @param {RegExp} pattern - pattern the header text must match.
 * @param {number} fallback - index returned when no header cell matches.
 * @returns {number} index of the matching column, or fallback.
 */
function findColumn(header, pattern, fallback) {
    for (var i = 0; i < header.length; i++) {
        if (pattern.test(header[i])) {
            return i;
        }
    }
    return fallback;
}

/**
 * Flattens table rows into { vaccine, country, year, value, source } records,
 * one record per (data row, year column) pair.
 *
 * NOTE(review): assumes the first row is a header in which year columns are
 * labelled with a four-digit year, and that the country / vaccine columns are
 * labelled as such (falling back to columns 0 and 1) - confirm against the
 * real page markup.
 *
 * @param {string[][]} rows - rows as returned by extractTableRows().
 * @returns {Object[]} the records; empty when there is no header or data row.
 */
function rowsToRecords(rows) {
    var records = [];
    if (rows.length === 0) {
        return records;
    }
    var header = rows[0];
    var countryColumn = findColumn(header, /country/i, 0);
    var vaccineColumn = findColumn(header, /vaccine/i, 1);
    for (var i = 1; i < rows.length; i++) {
        var cells = rows[i];
        for (var j = 0; j < cells.length; j++) {
            // Only columns whose header is a four-digit year hold values.
            if (!/^\d{4}$/.test(header[j] || '')) {
                continue;
            }
            records.push({
                vaccine: cells[vaccineColumn] || '',
                country: cells[countryColumn] || '',
                year: header[j],
                value: cells[j],
                // Nothing in the table is known to provide this field; left empty.
                source: ''
            });
        }
    }
    return records;
}

/**
 * GET /scraperWhoCoverage
 *
 * Downloads the page, converts its table into records and writes them to
 * output.json. Responds with 500 when the download or the file write fails;
 * otherwise responds once the file has been written.
 */
router.get('/scraperWhoCoverage', function(req, res) {
    // the URL we will scrape from
    var url = 'http://apps.who.int/immunization_monitoring/globalsummary/timeseries/tscoveragebcg.html';

    /**
     * The structure of our request call.
     * The first parameter is our URL.
     * The callback function takes 3 parameters: an error, the response object and the html.
     */
    request(url, function(error, response, html) {
        // stop early on a failed request: there is nothing to parse or write
        if (error) {
            console.log(error);
            res.status(500).send('Could not fetch the page to scrape.');
            return;
        }
        console.log("Getted: ", url);

        // utilize the cheerio library on the returned html which will essentially give us jQuery functionality
        var $ = cheerio.load(html);

        // The table is expected to be the second child of #page - TODO confirm against the page markup.
        var table = $('#page').children().first().next();
        var records = rowsToRecords(extractTableRows($, table));

        fs.writeFile('output.json', JSON.stringify(records, null, 4), function(err) {
            if (err) {
                console.log(err);
                res.status(500).send('Could not write output.json.');
                return;
            }
            console.log('File successfully written! - Check your project directory for the output.json file');
            // send out a message to the browser reminding you that this app does not have a UI
            res.send('This app does not have a UI. Check your console (command prompt)!');
        });
    }); // end request
}); // end get
exports = module.exports = router;
谢谢
答案 0 :(得分:0)
你可以使用这个 npm 包(here,我自己用过)。它会自动把 HTML 表格解析为 JSON,之后你可以用循环把该 JSON 重新组织成如下结构的对象:
{ vaccine : "", country : "", year : "", value: ""}
例如:
var tabletojson = require('tabletojson');
// Page whose tables are handed to tabletojson.
var pageUrl = 'http://apps.who.int/immunization_monitoring/globalsummary/timeseries/tscoveragedtp1.html';

// Options passed through to tabletojson.convertUrl unchanged.
var conversionOptions = { stripHtmlFromCells: false };

/**
 * Callback given to tabletojson.convertUrl.
 *
 * @param {*} tablesAsJson - the converted tables as provided by tabletojson.
 */
function handleTables(tablesAsJson) {
    // Placeholder: reshape tablesAsJson into the desired objects here, then
    // JSON.stringify the result and store it in output.json.
}

tabletojson.convertUrl(pageUrl, conversionOptions, handleTables);