绕过Node的异步性

时间:2016-07-25 01:21:33

标签: javascript jquery node.js asynchronous npm

我正在编写一个内容抓取工具,用于删除特定网站上的衬衫信息。我已经在Node中使用NPM包设置了所有内容来抓取并创建CSV文件。我遇到的问题是,众所周知,Node本质上是异步的。我试图编写的CSV文件是在我创建的JSON对象完成创建之前编写的(使用每个循环进行迭代来构建它),因此它传入json2csv(npm包)的'fields'参数。但它将我的数据作为空对象传递。任何人都可以告诉我如何告诉节点等到我的json对象生成之后再尝试使用fs.writefile创建CSV文件?谢谢

'use strict';

//require NPM packages

var request = require('request');
var cheerio = require('cheerio');
var fs = require('fs');
var json2csv = require('json2csv');

//Array for shirts JSON object for json2csv to write.
var ShirtProps = [];

var homeURL = "http://www.shirts4mike.com/";

//start the scraper
scraper(); 

//Initial scrape of the shirts link from the home page
function scraper () {
  //use the datafolderexists function to check if data is a directory
  if (!DataFolderExists('data')) {
    fs.mkdir('data');
  }
  //initial request of the home url + the shirts.php link
  request(homeURL + "shirts.php", function (error, response, html) {
    if (!error && response.statusCode == 200) {
      var $ = cheerio.load(html);

      //scrape each of the links for its html data
      $('ul.products li').each(function(i, element){
        var ShirtURL = $(this).find('a').attr('href');
        console.log(ShirtURL);
        //pass in each shirtURL data to be scraped to add it to an object
        ShirtHTMLScraper(ShirtURL);
      }); 
      FileWrite();
      // end first request
    } else {
      console.error(error);
    }
  });
}

//create function to write the CSV file.
function FileWrite() {
  var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Time'];
  var csv = json2csv({data: ShirtProps, fields: fields}); 
  console.log(csv);
  var d = new Date();
  var month = d.getMonth()+1;
  var day = d.getDate();
  var output = d.getFullYear() + '-' +
  ((''+month).length<2 ? '0' : '') + month + '-' +
  ((''+day).length<2 ? '0' : '') + day;

  fs.writeFile('./data/' + output + '.csv', csv, function (error) {
    if (error) throw error;      
  });    
}

//function to scrape each of the shirt links and create a shirtdata object for each.
function ShirtHTMLScraper(ShirtURL) {
  request(homeURL + ShirtURL, function (error, response, html) {
    if (!error && response.statusCode == 200) {
      var $ = cheerio.load(html);
      var time = new Date().toJSON().substring(0,19).replace('T',' ');
      //json array for json2csv
      var ShirtData = {
        title: $('title').html(),
        price: $(".price").html(),
        imgURL: $('img').attr('src'),
        url: homeURL + ShirtURL,
        time: time.toString() 
      };
      //push the shirt data scraped into the shirtprops array
      ShirtProps.push(ShirtData);
      console.log(ShirtProps);

      // //set the feilds in order for the CSV file
      // var fields = ['Title', 'Price', 'ImageURL', 'URL', 'Time'];

      // //use json2csv to write the file -

      // var csv = json2csv({data: ShirtProps, fields: fields}); 
      // console.log(csv);

      // //date for the filesystem to save the scrape with today's date.
      // var d = new Date();
      // var month = d.getMonth()+1;
      // var day = d.getDate();
      // var output = d.getFullYear() + '-' +
      // ((''+month).length<2 ? '0' : '') + month + '-' +
      // ((''+day).length<2 ? '0' : '') + day;

      //   //use filesystem to write the file, or overrite if it exists.
      //     fs.writeFile('./data/' + output + '.csv', csv, function (error) {
      //       if (error) throw error;

      //     }); //end writeFile
    } else {
      console.error(error);
    }
  });
}

//Check if data folder exists, source: http://stackoverflow.com/questions/4482686/check-synchronously-if-file-directory-exists-in-node-js
function DataFolderExists(folder) {
  try {
    // Query the entry
    var DataFolder = fs.lstatSync(folder);

    // Is it a directory?
    if (DataFolder.isDirectory()) {
      return true;
    } else {
      return false;
    }
  } //end try
  catch (error) {
    console.error(error);
  }
}

1 个答案:

答案 0 :(得分:3)

并不是说节点本质上是异步的,因为某些函数是异步的。在这种情况下,它是使用请求的异步调用。你在第二个请求调用(ShirtHTMLScraper内部的一个)开始后直接调用FileWrite。在填充ShirtProps之后,将调用FileWrite放在ShirtHTMLScraper的回调中。

编辑:仔细观察后,这也无济于事。问题是你在同步循环中调用异步函数。您可以通过创建一个计数器来实现这一点,该计数器在每个异步回调时递增,并检查您是否达到了要迭代的项目的长度。如果您正在进行最后一次迭代,请运行FileWrite。

更好的方法可能是查看Async库。你可以使用.each()来提供两个回调,一个在每次迭代时运行,一个在它们全部完成时运行。