刮不正确循环的问题

时间:2014-01-10 00:29:09

标签: javascript node.js web-scraping

经过一个下午尝试通过谷歌/这个网站解决问题 - 我在这里。我是Node的新手并试图构建一个简单的scraper。这是节点脚本:

var request = require('request')
  , cheerio = require('cheerio')
  , async = require('async')
  , format = require('util').format;

var link_ids = [     '3380003','3380043','3380045','3380065','3381016','3382322','3382330','3382333','3382335','3382337','3382338','3382398','3382399','3382413','3385073','3385079','3385081','3385091','3385092','3385094','3385108','3385212','3385305','3385308','3385309','3385310','3385312','3385313','3385314','3385316','3385317','3385319','3385320','3385322','3385323','3385325','3385326','3385327','3385328','3386099','3386107','3386167','3386179','3386307','3386480','3386708','3388618','3388642','3388657','3388662','3388687','3388700','3388710','3388714','3388724','3388739','3388743','3388746','3388755','3388760','3388775','3388777','3388781','3388791','3388795','3388799','3388814','3388815','3388816','3388817','3388818','3388819','3388820','3388821','3388822','3388823','3388839','3388842','3388845','3388851','3388853','3388857','3388858','3388859','3388861','3388862','3388865','3388866','3388867','3388869','3388870','3388872','3388874','3388877','3388878','3388879','3388880','3388882','3388884','3388886','3388887','3388890','3388893','3388944','3388954','3388965','3388966','3388967','3388976','3388992','3389252','3389610','3390415','3395441','3397159','3397715','3401362','3401363','3401368','3401369','3401370','3401469','3403141','3403179','3403210','3403214','3403224','3403227','3403284','3403296','3403320','3403367','3403409','3403445','3403484','3403506','3403509','3403684','3403703','3403726','3403732','3403784','3403790','3403830','3404034','3404042','3404048','3404051','3404054','3404084','3404092','3404103','3404116','3404119','3404131','3404199','3404211','3404213','3404244','3404257','3404270','3404278','3404298','3404308','3404322','3404382','3404386','3404400','3404404','3404407','3404417','3404443','3404459','3404481','3404494','3404503','3404510','3404513','3404540','3404544','3404550','3404552','3404571','3404577','3404584','3404601','3404603','3404609','3404612','3404615','3404621','3404648','3404650','3404652','3404666','3404668','3404671','3404676','3404683','3404687','3404694','3404695','3404715','3404720','3404723','3404724','3404727','3404735','3404744','3404751','3404752','3404763','3404764','3404771','3404778','3404786','3404808','3404813','3404833','3404836','3404846','3404854','3404855','3404857','3404867','3404874','3404876','3404889','3404890','3404905','3404915','3404919','3404930','3404976','3404985','3404988','3405005','3405322','3405323','3405339','3405356','3405391','3405417','3405437','3405521','3405555','3405988','3405990','3406001','3406004','3406006','3406026','3406057','3406076','3406082','3406094','3406097','3406102','3406113','3406114','3406135','3406144','3406151','3406170','3406187','3406188','3406190','3406194','3406198','3406218','3406227','3406239','3406247','3406252','3406263','3406264','3406271','3406272','3406275','3406277','3406284','3406305','3406308','3406310','3406311','3406314','3406867','3407013','3407033','3407035','3407037','3407041','3407043','3407044','3407052','3407130','3407147','3414449','3417123','3417211','3417214','3417420','3419589','3419626','3419630','3419820','3419826','3419846','3419870','3420062','3420063','3420104','3420110','3420119','3421037','3425064','3425893','3426273','3426352','3426355','3427276','3428314','3428401','3428434','3429517','3429528','3430538','3430574','3430578','3430579','3430580','3430582','3430589','3430596','3430606','3435840','3435958','3436895','3437604','3438352','3438454','3438474','3439656','3439723','3439844','3439857','3439879','3439886','3439913','3439935','3439965','3439989','3440014','3440079','3440123','3440132','3440147','3440154','3440160','3440200','3440210','3440239','3440248','3440252','3440257','3440282','3440290','3440310','3440311','3440322','3440327','3440352','3440381','3440436','3440439','3440441','3440442','3440542','3440710','3440731','3440787','3440791','3440854','3440861','3440868','3440882','3440891','3440900','3440918','3440930','3440931','3440950','3440961','3440962','3440989','3441000','3441065','3441115','3441116','3441162','3441214','3441217','3441222','3441242','3441266','3441270','3441282','3441292','3441293','3441295','3441310','3441317','3441321','3441333','3441334','3441335','3441337','3441338','3441347','3441353','3441356','3442482','3442645','3442715','3442729','3442789','3442842','3442875','3442911','3442924','3442944','3442962','3442963','3442976','3443034','3443087','3443122','3443199','3443264','3443296','3443303','3443312','3443352','3443383','3443384','3443431','3443502','3443533','3443600','3443602','3443619','3443732','3443794','3444067','3444069','3444078','3444095','3444103','3444109','3444112','3444116','3444121','3444123','3444137','3444139','3444143','3444145','3444156','3444159','3444162','3444163','3444164','3444169','3444170','3444172','3444173','3444179','3444181','3444182','3444183','3444184','3444185','3444186','3444190','3444191','3444201','3444202','3444204','3444205','3444209','3444210','3444213','3444214','3444216','3444225','3444226','3444229','3444232','3444233','3444240','3444252','3444255','3444257','3444262','3444267','3444272','3444298','3444306','3444326','3444332','3444348','3444360','3444363','3444372','3444373','3444378','3444379','3444382','3444384','3444429','3444432','3444433','3444436','3444446','3444458','3444466','3444468','3444471','3444472','3444473','3444474','3444475','3444476','3444477','3444478','3444479','3444682','3444683','3444757','3444766','3444767','3444768','3444771','3446030','3446034','3453760','3453771','3453830','3453833','3453834','3456928','3457996','3459899','3459907','3460129','3460255','3462250','3465387','3466274','3469772','3469806','3470021','3470030','3470431','3470539','3471732','3472097','3473318','3473319','3473385','3473388','3473414','3473422','3473423','3473424','3473426','3473901','3475425','3476507','3476571','3478992','3481001','3481019','3482341','3484681','3484690','3484699','3484700','3484716','3484725','3484728','3484729','3484734','3484740','3484818','3484860','3484876','3484882','3485158','3486460','3488814','3489457','3489888','3490869','3490871','3490875','3491084' ]
, concurrency = 4;

async.eachLimit(link_ids, concurrency, function (link_id, next) {
    var url = format('http://url.goes.here/query.php?id=', link_id);
    request(url, function (err, response, body) {
        if (err) throw err;
        var $ = cheerio.load(body);
        console.log(url);
    $( "table tbody tr td table tbody tr td table tbody tr td[align='LEFT']" ).each(function() {
        fs = require('fs');
        var data_to_append = $(this).text() + ' | ';
        console.log('Model Number Saved');
    });
    $( "table tbody tr td table tbody tr td table tbody tr td:contains('Event Date')" ).each(function() {
        fs = require('fs');
        data_to_append = data_to_append + $(this).text() + ' | ';
        console.log('Event Date Saved');
    });
    $( "table tbody tr td table tbody tr td table tbody tr td:contains('Event Type')" ).each(function() {
      fs = require('fs');
        var data_to_append = data_to_append + $(this).text() + '\n';
        fs.appendFile('C:/scraper/testing.txt', data_to_append, function (err) {
              if (err) throw err;
              console.log(data_to_append + ' - APPENDED!');
        });
    });
    next();
});

});

问题是脚本将遍历循环中的每个link_id HOWEVER,它只会进展到console.log(url);,然后跳到next()。这两个操作之间没有数据保存和控制台登录。

鉴于我对此很陌生,我很确定我犯了一个语法错误或一些同样愚蠢的错误 - 但是因为我没有看到任何错误信息 - 我不确定它会是什么。< / p>

感谢您的帮助。

1 个答案:

答案 0 :(得分:0)

而不是if (err) throw err;,而if (err) return next(err);您需要在所有情况下调用next回调,以便异步知道此工作已完成。一般情况下,异步对try / catch / throw不起作用,所以坚持使用回调。

这是你的代码片段,调整调用next();的地方以及如何处理错误。

var fs = require('fs');// do requires at the top

async.eachLimit(link_ids, concurrency, function (link_id, next) {
    var url = format('http://url.goes.here/query.php?id=', link_id);
    request(url, function (err, response, body) {
        if (err) return next(err);
        var $ = cheerio.load(body);
        console.log(url);
    $( "table tbody tr td table tbody tr td table tbody tr td[align='LEFT']" ).each(function() {
        fs = require('fs');
        var data_to_append = $(this).text() + ' | ';
        console.log('Model Number Saved');
    });
    $( "table tbody tr td table tbody tr td table tbody tr td:contains('Event Date')" ).each(function() {
        fs = require('fs');
        data_to_append = data_to_append + $(this).text() + ' | ';
        console.log('Event Date Saved');
    });
    $( "table tbody tr td table tbody tr td table tbody tr td:contains('Event Type')" ).each(function() {

        var data_to_append = data_to_append + $(this).text() + '\n';
        fs.appendFile('C:/scraper/testing.txt', data_to_append, function (err) {
              if (err) return next(err);
              console.log(data_to_append + ' - APPENDED!');
              next(); //call next here instead

        });
    });
    //Don't call next() here, you're not really done yet.
});