如何在处理下一页之前等待 JavaScript 网页抓取函数执行完毕?

时间:2015-01-16 00:08:19

标签: javascript jquery json node.js

我正在尝试创建一个 web scraper(在 node.js 中),它将从站点中提取信息并写入文件。我构建的版本对单个页面可以正确工作,但是当我尝试在 for 循环中使用该函数来遍历多个游戏时,所有游戏得到的数据都是错误的。

我知道这与Javascript的异步性质有关,我已经阅读了有关回调函数的内容,但我不确定我是否理解如何将其应用于我的代码。任何帮助都将非常感激:

// Iterate over a few gameIDs, used in the URL for the request.
// `x` is declared with `var` so the loop does not create an implicit global
// (which is a ReferenceError in strict mode / ES modules).
// Note: the loop itself does not wait for scrapeGame to finish any
// asynchronous work before moving on to the next ID.
for (var x = 4648; x < 4650; x++) {
    scrapeGame(x);
}

// Placeholder for the scraping routine; receives the game ID that the
// loop above passes in (used in the request URL).
function scrapeGame(gameId){
    //request from URL, scrape HTML to arrays as necessary
    //write final array to file
}

基本上,我要做的是,在for循环中,告诉它等待完成scrapeGame(x)函数,然后递增x并运行它以进行下一个游戏 - 否则,数组开始覆盖彼此和数据变得一团糟。

编辑:我现在已经附上了我试图运行的完整代码!写完文件后查看文件时,我发现结果不对。例如,第一个文件是 8kb,第二个约 16kb,第三个约 32kb,依此类推。看起来在运行下一个游戏之前,数组没有被清空。

该计划的想法是从存档网站提取Jeopardy问题/答案,以便最终为自己构建一个测验应用程序。

//Iterate over arbitrary number of games, scrape each

// Start a scrape for each game ID and log the status reported through the
// callback. `x` is declared with `var` so the loop does not create an
// implicit global (a ReferenceError in strict mode / ES modules).
// Note: the loop does not pause between iterations; every scrapeGame call
// is issued immediately, and each callback only reports that call's status.
for (var x = 4648; x < 4650; x++) {
    scrapeGame(x, function(scrapeResult) {
        if (scrapeResult) {
            console.log('Scrape Successful');
        } else {
            console.log('Scrape ERROR');
        }
    });
}

/**
 * Scrapes one game: requests the clue page and then the responses page for
 * gameId, combines ids / categories / values / clues / answers into
 * pipe-delimited rows and writes them to 'J_GAME_<gameId>.txt'.
 *
 * @param {number} gameId    Value appended to the game_id query parameter
 *                           of both URLs and used in the output file name.
 * @param {function} callback Called with `status` as the last statement of
 *                           this function body (see the review note there).
 */
function scrapeGame(gameId, callback){
    // NOTE(review): this `var` statement ends with ';', so the indented
    // assignments that follow are NOT part of the declaration. They assign
    // to undeclared variables, i.e. globals shared by every scrapeGame call.
    var request = require('request');
        cheerio = require('cheerio');
        fs = require('fs');
        categories = [];
        categorylist = [];
        ids = [];
        clues = [];
        // Index 0 is padding; indices 1-5 are used for "_J_" ids and
        // indices 6-10 for "_DJ_" ids (see the +5 offset below).
        values = ['0','$200','$400','$600','$800','$1000','$400','$800','$1200','$1600','$2000'];
        valuelist = [];
        answers = [];
        array = [];
        file = [];
        status = false;

    var showGameURL = 'http://www.j-archive.com/showgame.php?game_id=' + gameId;
    var showAnswerURL = 'http://www.j-archive.com/showgameresponses.php?game_id=' + gameId;

    // First request: clue page. Nothing happens (and nothing is reported)
    // when the request errors or the status code is not 200.
    request(showGameURL, function(err, resp, body){ 
    if(!err && resp.statusCode === 200){
        var $ = cheerio.load(body);
        //add a row to categories to avoid starting at 0
        categories.push('Category List');
        //pull all categories to use for later
        $('td.category_name').each(function(){
            var category = $(this).text();
            categories.push(category);
        });
        //pull all clue IDs (coordinates), store to 1d array
        //pull any id that has "stuck" in the string, to prevent duplicates
        $("[id*='stuck']").each(function(){
            var id = $(this).attr('id');
            id = id.toString();
            // Drop the last 6 characters of the id (presumably a "_stuck" suffix).
            id = id.substring(0, id.length - 6);
            ids.push(id);
            //if single J, pick category 1-6
            if (id.indexOf("_J_") !== -1){
                // Single characters at fixed positions are used as the
                // category and value indices.
                var catid = id.charAt(7);
                categorylist.push(categories[catid]);
                var valId = id.charAt(9);
                valuelist.push(values[valId]);
            }
            //if double J, pick category 7-12
            else if (id.indexOf("_DJ_") !== -1){
                var catid = parseInt(id.charAt(8)) + 6;
                categorylist.push(categories[catid]);
                var valId = parseInt(id.charAt(10)) + 5;
                valuelist.push(values[valId]);                
            }
            //if final J, pick category 13
            else {
                // No entry is pushed to valuelist in this branch.
                categorylist.push(categories[13]);
            }
        });
        //pull all clue texts, store to 1d array
        $('td.clue_text').each(function(){
            var clue = $(this).text();
            clues.push(clue);
        });
        //push pulled values to big array
        array.push(ids);
        array.push(categorylist);
        array.push(valuelist);
        array.push(clues);

        //new request to different URL to pull responses
        request(showAnswerURL, function(err, resp, body){ 
            if(!err && resp.statusCode === 200){
                var $ = cheerio.load(body);

                $('.correct_response').each(function(){
                    var answer = $(this).text();
                    answers.push(answer);
                });
                //push answers to big array
                array.push(answers);
                //combine arrays into 1-d array to prep for writing to file
                for(var i = 0; i < array[0].length; i++){
                    var print = array[0][i] + "|" + array[1][i] + "|" + array[2][i] + "|" + array[3][i] + "|" + array[4][i];
                    var stringPrint = print.toString();
                    file.push(stringPrint);
                }
                //update string, add newlines, etc.
                // The rows are serialized as a JSON array, then every backslash
                // is removed and each '","' separator becomes a newline.
                var stringFile = JSON.stringify(file);
                stringFile = stringFile.split('\\').join('');
                stringFile = stringFile.split('","').join('\n');
                //write to file, eventually will append to end of one big file
                fs.writeFile('J_GAME_' + gameId +'.txt', stringFile, function(err) {
                    if(err) {
                        console.log(err);
                    } else {
                        console.log("Game #" + gameId + " has been scraped.");
                        // Only set here, after callback(status) below has already been called.
                        status = true;
                    }
                });
            }
        });
    }
});
        // NOTE(review): everything below sits outside the request(...)
        // callbacks, so (assuming `request` invokes its callback
        // asynchronously) it runs right after the first request is issued,
        // before any response has been handled. The shared globals are
        // re-assigned while earlier calls may still be pending, and `status`
        // is still false when it is handed to the callback.
        //clear arrays used
        valuelist = [];
        answers = [];
        categories = [];
        categorylist = [];
        ids = [];
        clues = [];
        array = [];
        file = [];
        //feed callback status
        callback(status);
}

3 个答案:

答案 0 :(得分:3)

// Iterate over a few gameIDs, used in URL for request.
// Scrape a few game IDs strictly one after another.
//
// A plain `for` loop cannot wait for an asynchronous callback (and `break`
// is a SyntaxError inside a callback function), so the next scrape is
// started from inside the completion callback of the previous one.
var firstGameId = 4648;
var endGameId = 4650; // exclusive upper bound, same range as `x < 4650`

/**
 * Scrape `gameId`, then continue with the following ID once scrapeGame has
 * reported its result through the callback.
 *
 * @param {number} gameId - ID of the game to scrape next.
 */
function scrapeNext(gameId) {
  if (gameId >= endGameId) {
    // All requested games have been processed.
    return;
  }

  // The anonymous function is the callback; scrapeGame calls it with a
  // boolean status and, on failure, an error.
  scrapeGame(gameId, function(scrapeResult, err) {
    if (scrapeResult) {
      console.log('Scrape Successful');
      // Only now is it safe to move on to the next game ID.
      scrapeNext(gameId + 1);
    } else {
      // Stop the chain on the first failure.
      // Call scrapeNext(gameId + 1) here instead if you want to skip the
      // failed game and keep going.
      console.log('Scrape ERROR :: ' + err);
    }
  });
}

scrapeNext(firstGameId);

// This function now accepts two arguments.
/**
 * Scrape a single game from j-archive.com and write it to
 * 'J_GAME_<gameId>.txt' as pipe-delimited rows:
 *   clue id | category | value | clue text | correct response
 *
 * All working arrays are local to the call, so concurrent or consecutive
 * calls cannot overwrite each other's data.
 *
 * @param {number|string} gameId - game_id used in both request URLs and in
 *     the output file name.
 * @param {function(boolean, *=)} callback - Called with (true) after the
 *     file has been written, or with (false, err) when a request fails,
 *     a page does not answer with HTTP 200, or the file cannot be written.
 */
function scrapeGame(gameId, callback) {
  var request = require('request'),
      cheerio = require('cheerio'),
      fs = require('fs'),
      // Index 0 is a placeholder so that category numbers start at 1.
      categories = ['Category List'],
      categorylist = [],
      ids = [],
      clues = [],
      // Index 0 is padding; 1-5 are used for "_J_" clues, 6-10 for "_DJ_" clues.
      values = [
          '0',
          '$200',
          '$400',
          '$600',
          '$800',
          '$1000',
          '$400',
          '$800',
          '$1200',
          '$1600',
          '$2000'
      ],
      valuelist = [],
      answers = [],
      rows = [],
      showGameURL = 'http://www.j-archive.com/showgame.php?game_id=' + gameId,
      showAnswerURL = 'http://www.j-archive.com/showgameresponses.php?game_id=' + gameId;

  // Fetch `url` and pass the body to onBody. Transport errors and non-200
  // responses are reported through callback(false, err) instead of being
  // silently dropped, so the caller is always told how the scrape ended.
  function fetchPage(url, onBody) {
    request(url, function(err, resp, body) {
      if (err) {
        callback(false, err);
        return;
      }
      if (resp.statusCode !== 200) {
        callback(false, new Error('Unexpected HTTP status ' + resp.statusCode + ' for ' + url));
        return;
      }
      onBody(body);
    });
  }

  fetchPage(showGameURL, function(gameBody) {
    var $ = cheerio.load(gameBody);

    //pull all categories to use for later
    $('td.category_name').each(function() {
      categories.push($(this).text());
    });

    //pull all clue IDs (coordinates), store to 1d array
    //pull any id that has "stuck" in the string, to prevent duplicates
    $("[id*='stuck']").each(function() {
      var id = $(this).attr('id').toString();
      // Drop the trailing 6 characters of the id (presumably "_stuck").
      id = id.substring(0, id.length - 6);
      ids.push(id);

      if (id.indexOf("_J_") !== -1) {
        //single J: categories 1-6, values 1-5
        categorylist.push(categories[parseInt(id.charAt(7), 10)]);
        valuelist.push(values[parseInt(id.charAt(9), 10)]);
      } else if (id.indexOf("_DJ_") !== -1) {
        //double J: categories 7-12, values 6-10
        categorylist.push(categories[parseInt(id.charAt(8), 10) + 6]);
        valuelist.push(values[parseInt(id.charAt(10), 10) + 5]);
      } else {
        //final J: category 13 (no value is recorded for it)
        categorylist.push(categories[13]);
      }
    });

    //pull all clue texts, store to 1d array
    $('td.clue_text').each(function() {
      clues.push($(this).text());
    });

    //new request to different URL to pull responses
    fetchPage(showAnswerURL, function(answerBody) {
      var $responses = cheerio.load(answerBody);

      $responses('.correct_response').each(function() {
        answers.push($responses(this).text());
      });

      //combine the parallel arrays into one row per clue
      for (var i = 0; i < ids.length; i++) {
        rows.push(ids[i] + "|" + categorylist[i] + "|" + valuelist[i] + "|" + clues[i] + "|" + answers[i]);
      }

      // Output format is kept as-is: serialize as a JSON array, strip every
      // backslash, and turn each '","' separator into a newline.
      // NOTE: this leaves the leading '["' and trailing '"]' in the file.
      var stringFile = JSON.stringify(rows);
      stringFile = stringFile.split('\\').join('');
      stringFile = stringFile.split('","').join('\n');

      //write to file, eventually will append to end of one big file
      fs.writeFile('J_GAME_' + gameId + '.txt', stringFile, function(err) {
        if (err) {
          // Callback false with error.
          callback(false, err);
          return;
        }
        console.log("Game #" + gameId + " has been scraped.");
        // Callback true with no error.
        callback(true);
      });
    });
  });
}

答案 1 :(得分:1)

我的假设是你希望它们一个接一个地被抓取,而不是并行执行。因此,for 循环在这里帮不上忙。以下方法应该可以解决问题:

    var x = 4648;

    // Start the first scrape directly. Every later scrape is started from
    // the completion callback of the previous one, so games run in series.
    scrapeGame(x, function cb(){
        // Advance first, then check, so the last game scraped is 4649
        // (same range as `x < 4650` in the original loop).
        x++;
        if(x >= 4650){
           return;
        }
        scrapeGame(x, cb);
    });



/**
 * Placeholder scraper.
 *
 * @param {number} gameId - ID of the game to scrape.
 * @param {function} cb - Called once this game is completely done; in real
 *     code call it from the handler that runs after the file is written.
 */
function scrapeGame(gameId, cb){
    //request from URL, scrape HTML to arrays as necessary
    //write final array to file
    //signal completion so the next game can start
    cb();
}

对于嵌套的异步函数,如果希望它们以串行方式执行,就不应该使用 for 循环。

使用http客户端正确处理请求的示例:

/**
 * Example of handling a request with the http client: collect the response
 * body chunk by chunk and call `cb` once the response has ended.
 *
 * @param {number} gameId - ID of the game to scrape.
 * @param {function} cb - Called with no arguments after the response has
 *     been fully received, or with the error if the request fails.
 */
function scrapeGame(gameId, cb){

//your code and set options

var req = http.request(options, function(response){
    var result = "";
    response.on('data', function (chunk) {
                result += chunk;
             });
    response.on('end',function(){
               //write data here;

               //do the callback
               cb();
            });
});

// Report connection-level failures instead of letting the 'error' event go
// unhandled.
req.on('error', function(err){
    cb(err);
});

// http.request() only creates the request; end() is what finishes and sends it.
req.end();

}

答案 2 :(得分:0)

我解决了我所遇到问题的根本原因,不过我确信,如果没有上面回答中关于回调的帮助,我同样会毫无头绪。

原来数据处理是正确的,但文件写入的顺序被打乱了。事实证明,可以调用另一种方法来代替 writeFile 或 appendFile:

// Synchronous variant of the appendFile method mentioned above
// (arguments omitted in this illustration).
fs.appendFileSync();

调用同步版本后,写入操作会按照追加到文件的先后顺序依次执行,而不是各自随意进行。再加上上面关于回调的帮助,问题就解决了。

感谢大家的帮助!