Question

I am trying to produce the following JSON structure:

"events": [
        {
            "start_date": {
                "year": "602"
            },  
            "end_date": {
                "year":"629"
            },
            "media": {
                "url": "https://en.wikipedia.org/wiki/Roman-Persian_Wars"
            },
            "background": {
        "opacity":"50",
                "url": "https://upload.wikimedia.org/wikipedia/commons/a/a2/HumiliationValerianusHolbein.jpg"
            },
            "text": {
                "headline": "Last great Roman-Persian War.",
                "text": "Long conflict leaves both empires exhausted and unable to cope with the newly united Arab armies under Islam in the 630s"
            }
        },
    {
    "start_date": {
        "year": "604"
    },
    "end_date": {
        "year": "609"
    },
    "media": {
        "url": "https://en.wikipedia.org/wiki/Grand_Canal_(China)"
    },
    "background": {
"opacity":"50",
        "url": "https://upload.wikimedia.org/wikipedia/commons/a/ad/Sui_Wendi_Tang.jpg"
    },
    "text": {
        "headline": "Grand Canal in China is fully completed",
        "text": "Its main role throughout its history was the transport of grain to the capital."
    }
}

There are about 25 objects within the events array, but to make this shorter I've only included two here.

For the moment I'm only trying to create the "sub-objects" of background and text within the master object.

I am scraping this Wikipedia page with Node and the request and cheerio libraries: https://en.wikipedia.org/wiki/Timeline_of_the_Middle_Ages

The first part of my code (below) uses the request library to gather together all the links to other pages from the main Wikipedia "landing page":

request(landingPage, function (err, response, body) {
    var $ = cheerio.load(body);
    var absoluteLinks = [];

// GET REMOTE PAGE LINKS FOR IMAGES:
    // eq(1) = 7th Century Table
    $('.wikitable').eq(1).find('tr').each(function() {
        var $link = $(this).find('td').eq(2).find('a').eq(0).attr('href');
        if ( $link != undefined || $link != null ) {
          absoluteLinks.push("https://en.wikipedia.org" + $link);
        }
    });
    getRemoteImages(absoluteLinks);
 });

The second part uses Promise.all in order to ensure that the array of image urls is constructed in the same order as the array of scraped page urls:

function getRemoteImages(absoluteLinks) {
    Promise.all(absoluteLinks.map (function (a) {
     return new Promise(function(resolve, reject) {
       request(a, function(err, response, body) {
         if (err) { return reject(err); }
         $ = cheerio.load(body);
         var $thumbImg = $('.infobox').find('img').eq(0).attr('src');
         // To do: make full size image
         $thumbImg = "https:" + $thumbImg;
          resolve({ thumbImg: $thumbImg });
       });
     });
    })).then(function (result) {
      cleanImages(result);
    }).catch(function(err) {
      console.log(err);
    });
}

The third part of the code is the bit I'm struggling with:

function buildTextSection(result) {

    request(landingPage, function (err, response, body) {
      var data = { "events": [] };

      $ = cheerio.load(body);
      $('.wikitable').eq(1).find('tr').each(function() {
        var evObj = {};
          var $headline = $(this).find('td').eq(2).html();
          var $text = $(this).find('td').eq(3).text();
          evObj.text = {"headline": $headline, "text": $text };

          data.events.push(evObj);
      }); // end each
      console.log(data.events.length);
      buildImageSection(data, result);
    });
  }

  function buildImageSection(data, result) {

    result.forEach(function(obj) {
      data.events.background = {"opacity": "50", "url": obj.thumbImg };
      console.log(data);
      // console.log(data.events);
    }); // end forEach
  }

I can't find a way integrate the two different each iterators (one gathering the text data from the "local" landing page, and the other gathering the image urls from each "remote" Wikipedia destination page).

My last attempt (of many) generates this output when I run the file in the terminal with console.log(data);

{ events: 
   [ { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     { text: [Object] },
     background: { opacity: '50',
       url: 'https://upload.wikimedia.org/wikipedia/commons/thumb/1/10/History_of_Korea-Inter-country_Age-830_CE.gif/220px-History_of_Korea-Inter-country_Age-830_CE.gif' } ] }

How can I solve this problem? I've completely run out of ideas now! Thanks!

Answer 1

表示背景属性部分：

<{1>}中的

，在每个循环中覆盖相同的buildImageSection属性。如果您确定background和data.events的顺序相同（且元素数相同），则可以通过数字索引进行访问：

results

如果您对其他部分有疑问，我可以详细说明，但请尽量准确地提出您的问题;）

编辑更新：遗憾的是，如果它们的长度不同，则需要进行一些更改。您需要一些方法来识别行以找回它们。为了保持简单，我只会考虑你一次解析一个表（我看到wiki在不同的表中将它们分割了几个世纪）。如果你想获得所有的表格，你将需要另一种方式跟踪表格，就像行一样。这可以使用关联数组来完成，例如，使用格式化为function buildImageSection(data, result) { var index = 0; result.forEach(function(obj) { //data.events.background = {"opacity": "50", "url": obj.thumbImg }; data.events[index].background = {"opacity": "50", "url": obj.thumbImg }; index++; console.log(data); // console.log(data.events); }); // end forEach }

的键

幸运的是，您可以使用jquery识别行索引（我注释了所有内容以突出显示所做的更改）：

table1row4

然后对于其余部分，您希望以相同的方式对数组进行排序：

/*request(landingPage, function (err, response, body) {
    var $ = cheerio.load(body);
    var absoluteLinks = {};

    // GET REMOTE PAGE LINKS FOR IMAGES:
    // eq(1) = 7th Century Table
    $('.wikitable').eq(1).find('tr').each(function() {
        var $link = $(this).find('td').eq(2).find('a').eq(0).attr('href');
        if ( $link != undefined || $link != null ) {*/

          //i show here the simpler method, using the tr index to be sure that numeric
          //indexes match, but this won't work if you wan to store multiple
          //tables in the same array
          absoluteLink[$(this).index()] = "https://en.wikipedia.org" + $link;

        /*}
    });
    getRemoteImages(absoluteLinks);
});*/

最后，你必须在填充它之前测试数组中图像的存在（你必须继续反过来循环data.events，因为javascript不适合带有数字索引的数组和“漏洞” “在里面”

/*function buildTextSection(result) {

    request(landingPage, function (err, response, body) {
      var data = { "events": [] };

      $ = cheerio.load(body);
      $('.wikitable').eq(1).find('tr').each(function() {
        var evObj = {};
          var $headline = $(this).find('td').eq(2).html();
          var $text = $(this).find('td').eq(3).text();
          evObj.text = {"headline": $headline, "text": $text };*/

          //the tr index again
          data.events[$(this).index()] = evObj;


      /*}); // end each
      console.log(data.events.length);
      buildImageSection(data, result);
    });
}*/

How to combine result of two each iterators into one JSON array of objects

1 个答案: