抓取一个html页面并将其变成一个json对象

时间:2017-04-04 16:29:09

标签: javascript html json node.js

我试图抓取一个html页面并将其变成一个json对象

这是该页面:

03-04-2017-output-1.txt
03-04-2017-output-2.txt
04-04-2017-output-1.txt
04-04-2017-output-2.txt

这是我的尝试:

<html><head><title>Index</title><meta charset="UTF-8"></head><body><div><p>[ <a href="index.html">Index</a> ] | [ <a href="config.html">Device Config</a> ]</p></div><div>Neighbors<pre>fe80::212:4b00:8b8:6ecb REACHABLE</pre></div><div>Default Route<pre>fe80::212:4b00:8b8:6ecb</pre></div><div>Routes<pre></pre></div><div>Sensors<pre>Battery Temp = 19 C
Battery Volt = 3320 mV
Air Pressure = 1031.12 hPa
Air Temp = 22.66 C
Object Temp = 12.375 C
Ambient Temp = 23.062 C
Light = 0.00 lux
HDC Humidity = 43.93 %RH
HDC Temp = 23.03 C
Acc X = 0.02 G
Acc Y = 0.02 G
Acc Z = -1.10 G
Gyro X = -2.93 deg per sec
Gyro Y = -2.74 deg per sec
Gyro Z = 5.18 deg per sec</pre></div><div>Page hits: 4<br>Uptime: 138 secs<br></div></body></html>

如何把 Sensors 这个 div 中的内容转换成一个对象,以传感器名称作为键、传感器数据作为对应的值?

更新:

感谢 RafalWiliński 的帮助,我基本上让它工作了,但对象中最后一个键的值里仍然带有 div 的 HTML 标记

新代码:

var request = require('request');
var cheerio = require('cheerio');





// Fetch the device's index page and print its HTML wrapped in a JSON object
// of the form {"id": "<html>..."}.
request('http://[aaaa::212:4b00:c2a:b704]/index.html', function (error, response, html) {
  // Report failures instead of silently doing nothing.
  if (error) {
    console.error(error);
    return;
  }
  if (response.statusCode !== 200) {
    console.error(`Unexpected status code: ${response.statusCode}`);
    return;
  }

  // `temp` is already a plain object. Passing it to JSON.parse would coerce
  // it to the string "[object Object]" and throw a SyntaxError, so it is
  // serialized directly.
  const temp = { id: html };
  console.log(JSON.stringify(temp));
});

但我的输出是

var request = require('request');
var cheerio = require('cheerio');



 // Fetch the device's index page, extract the "Sensors" readings and print
 // them as a JSON object keyed by sensor name.
 request('http://[aaaa::212:4b00:c2a:b704]/index.html', function (error, response, html) {
   // Report failures instead of silently doing nothing.
   if (error) {
     console.error(error);
     return;
   }
   if (response.statusCode !== 200) {
     console.error(`Unexpected status code: ${response.statusCode}`);
     return;
   }

   // Parse the markup and keep only the text of the <pre> inside the
   // "Sensors" div. Splitting the whole document by line would leave HTML
   // fragments in the first key and in the last value.
   const $ = cheerio.load(html);
   const sensorText = $('div:contains("Sensors") > pre').first().text();

   const separator = ' = ';
   const obj = {};
   sensorText.split('\n').forEach((line) => {
     const at = line.indexOf(separator);
     // Skip blank or malformed lines that carry no "name = value" pair.
     if (at === -1) {
       return;
     }
     obj[line.slice(0, at).trim()] = line.slice(at + separator.length).trim();
   });

   console.log(JSON.stringify(obj, null, ' '));
 });

2 个答案:

答案 0 :(得分:2)

您需要按 = 符号拆分每一行字符串:= 前面的部分是键,后面的部分是值。

下面这个函数或许可以解决此问题:

/**
 * Convert text made of "name = value" lines into a plain object.
 *
 * Each line is split at its first " = ": the text before it becomes the
 * property name and everything after it becomes the (string) value, so a
 * value may itself contain " = ". Surrounding whitespace, including a
 * trailing "\r" from CRLF input, is trimmed from both parts. Lines without
 * the separator (blank lines, stray markup) and lines with an empty name
 * are skipped rather than stored as junk keys with undefined values.
 *
 * @param {string} str - Newline-separated "name = value" lines.
 * @returns {Object<string, string>} Mapping of names to their values.
 */
function jsonify(str) {
  const separator = ' = ';
  const obj = {};
  str.split('\n').forEach((line) => {
    const at = line.indexOf(separator);
    if (at === -1) {
      return;
    }
    const key = line.slice(0, at).trim();
    if (key === '') {
      return;
    }
    obj[key] = line.slice(at + separator.length).trim();
  });
  return obj;
}

答案 1 :(得分:1)

我建议您使用 HTML 解析器(我个人觉得 jQuery 很容易上手,不过可选的解析器还有很多)来查找并获取特定元素的内容。然后,您就可以对得到的结果运行解析逻辑。

// Sample of the page returned by the device, inlined so the snippet is self-contained.
const response = '<html><head><title>Index</title><meta charset="UTF-8"></head><body><div><p>[ <a href="index.html">Index</a> ] | [ <a href="config.html">Device Config</a> ]</p></div><div>Neighbors<pre>fe80::212:4b00:8b8:6ecb REACHABLE</pre></div><div>Default Route<pre>fe80::212:4b00:8b8:6ecb</pre></div><div>Routes<pre></pre></div><div>Sensors<pre>Battery Temp = 19 C\nBattery Volt = 3320 mV\nAir Pressure = 1031.12 hPa\nAir Temp = 22.66 C\nObject Temp = 12.375 C\nAmbient Temp = 23.062 C\nLight = 0.00 lux\nHDC Humidity = 43.93 %RH\nHDC Temp = 23.03 C\nAcc X = 0.02 G\nAcc Y = 0.02 G\nAcc Z = -1.10 G\nGyro X = -2.93 deg per sec\nGyro Y = -2.74 deg per sec\nGyro Z = 5.18 deg per sec</pre></div><div>Page hits: 4<br>Uptime: 138 secs<br></div></body></html>';

// Turn the markup into a jQuery collection (`$` is expected to be jQuery).
const responseDOM = $(response);

// The sensor readings are in the fourth <pre> of the page; .eq() is
// zero-based, so that element is at index 3.
const preContent = $('pre', responseDOM).eq(3).text();

// Split the content into lines, cut each line at its first " = ", and merge
// the resulting name/value pairs into a single object.
const separator = ' = ';
const obj = preContent
  .split('\n')
  .reduce((acc, line) => {
    const at = line.indexOf(separator);
    // Ignore lines that carry no "name = value" pair instead of storing
    // them as keys with undefined values.
    if (at !== -1) {
      acc[line.slice(0, at)] = line.slice(at + separator.length);
    }
    return acc;
  }, {});

// Finally, turn the object into a JSON string.
const json = JSON.stringify(obj);

console.log(json);
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.1/jquery.min.js"></script>