Puppeteer和jnode多个异步调用在相同时间结束时返回相同结果

时间:2019-09-13 20:24:40

标签: node.js async-await puppeteer

我正在使用Node和Puppeteer开发抓取/抓取工具。我正在尝试同时运行多个搜寻器。每个人访问一个不同的站点,登录并执行操作。最后,它们全部返回一个包含数据的json。发生错误时,该方法会在字段上返回一个带有“ N / A”的json。

主文件:

var a = require('./a');
var b = require('./b');
var c= require('./c');
var d= require('./d');

const http = require('http');
var url = require("url");

http.createServer((request, response) => {
    if (request.method === 'GET') {
        var urlObj = url.parse(request.url, true);

       // cnpj is the company id I want my scrapper to look for.
        if (urlObj['query']['cnpj'] != undefined) {
            var cnpj = urlObj['query']['cnpj'];

            response.setHeader('Content-Type', 'application/json');
            response.writeHeader(200);

            getCotacao(cnpj).then(ret => {
                response.end(JSON.stringify(ret));
            })

        }
    } else {
        response.statusCode = 404;
        response.end();
    }
}).listen(8080, 'localhost');



async function getCotacao(cnpj) {
    var returnCount = 4;
    var sleepCycle = 0;

    var ret = [];

// run all of them
        a(cnpj).then(response => { console.log('Ended a:' + JSON.stringify(response)); ret = ret.concat(response); returnCount--; });

    b(cnpj).then(response => {   console.log('Ended b:' + JSON.stringify(response)); ret = ret.concat(response); returnCount--;   });

    c(cnpj).then(response => {   console.log('Ended: c:' + JSON.stringify(response)) ; ret = ret.concat(response); returnCount--;  });

    d(cnpj).then(response => {    console.log('Ended d:' + JSON.stringify(response)) ; ret = ret.concat(response); returnCount--;   });

    //wait them to finish or not
    while (returnCount > 0 && sleepCycle < 90) { // 1:30m timeout. After the timeout or all promises returned, end the code. Which ever comes first 
        await sleep(1000).then(function () {
            sleepCycle++;
            console.log("sleepCycle: " + sleepCycle);
        });
    }

    return ret;
}

我的a,b,c方法看起来都一样:

var puppeteer = require('puppeteer');

const a= async (cnpj) => {
        var browser;
        try {
            var timeout = 3000; // 7 segundos
            browser = await puppeteer.launch(
                {   headless: true
                }
            );
            const page = await browser.newPage();
            await page.goto('a_website.com', {waitUntil: 'load', timeout: timeout});
            await page.waitForSelector('#txtLogin', {timeout: timeout});
            await page.focus('#txtLogin');
            await page.keyboard.type('my_login');
            await page.focus('#txtSenha');
            await page.keyboard.type('my_pass');
            await page.click('#btnLogin');
            await page.waitForSelector('#btnSic', {timeout: timeout});
            await page.click('#btnSic');
            await page.waitForSelector('#txtCnpj', {timeout: timeout});
            await page.focus('#txtCnpj');
            await page.keyboard.type(cnpj);
            await page.click('#btnAnalisar');

            await page.waitForSelector('#Finaliza', {timeout: timeout});

            const CreditLimit = await page.evaluate(() => document.querySelector('#Finaliza > table:nth-child(3) > tbody > tr:nth-child(5) > td:nth-child(2) > span').textContent);
            const RiskRate = await page.evaluate(() => document.querySelector('#Finaliza > table:nth-child(3) > tbody > tr:nth-child(6) > td:nth-child(2) > span').textContent);
            const limitAvailable = "";

            data = {
                'a':
                    {
                        CreditLimit,
                        RiskRate,
                        limitAvailable
                    }
            }


            await browser.close();
            return data;
        } catch (e) {
            console.log(e);
            data = {
                'a': {
                    'CreditLimit': 'N/D',
                    'limitAvailable': 'N/D',
                    'RiskRate': 'N/D'
                }
            }
            if (browser)
                await browser.close();

            return data;
        }
};

module.exports = a;

问题:有时(并非总是:)),当我运行它时,其中一个方法(a,b,c或d)获得伪指令超时并完成,接近于另一个方法也完成的时间,两者在.then()子句上获得相同的返回值。日志为:

sleepCycle: 1
sleepCycle: 2
sleepCycle: 3
sleepCycle: 4
{ TimeoutError: Navigation Timeout Exceeded: 3000ms exceeded
    at Promise.then (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\LifecycleWatcher.js:142:21)
  -- ASYNC --
    at Frame.<anonymous> (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\helper.js:111:15)
    at Page.goto (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\Page.js:674:49)
    at Page.<anonymous> (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\helper.js:112:23)
    at a (C:\Users\fabyt\PhpstormProjects\robo_garantia\src\a.js:15:24)
    at process._tickCallback (internal/process/next_tick.js:68:7) name: 'TimeoutError' }
{ TimeoutError: Navigation Timeout Exceeded: 3000ms exceeded
    at Promise.then (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\LifecycleWatcher.js:142:21)
  -- ASYNC --
    at Frame.<anonymous> (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\helper.js:111:15)
    at Page.goto (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\Page.js:674:49)
    at Page.<anonymous> (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\helper.js:112:23)
    at b(C:\Users\fabyt\PhpstormProjects\robo_garantia\src\b.js:21:20)
    at process._tickCallback (internal/process/next_tick.js:68:7) name: 'TimeoutError' }
sleepCycle: 5
{ TimeoutError: waiting for selector "#idToken1" failed: timeout 3000ms exceeded
    at new WaitTask (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\DOMWorld.js:561:28)
    at DOMWorld._waitForSelectorOrXPath (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\DOMWorld.js:490:22)
    at DOMWorld.waitForSelector (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\DOMWorld.js:444:17)
    at Frame.waitForSelector (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\FrameManager.js:628:47)
    at Frame.<anonymous> (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\helper.js:112:23)
    at Page.waitForSelector (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\Page.js:1089:29)
    at c(C:\Users\fabyt\PhpstormProjects\robo_garantia\src\c.js:16:24)
    at process._tickCallback (internal/process/next_tick.js:68:7) name: 'TimeoutError' }
Ended a:{"c":{"CreditLimit":"N/D","limitAvailable":"N/D","RiskRate":"N/D"}}
Ended b:{"c":{"CreditLimit":"N/D","limitAvailable":"N/D","RiskRate":"N/D"}}
Ended c:{"c":{"CreditLimit":"N/D","limitAvailable":"N/D","RiskRate":"N/D"}}
sleepCycle: 6
sleepCycle: 7
sleepCycle: 8
sleepCycle: 9
sleepCycle: 10
Ended d:{"d":{"CreditLimit":"1.000.000.00","limitAvailable":"R$ 1,000,000.00","RiskRate":"1.5000"}}
sleepCycle: 11

重要的部分在这里:

Ended a:{"c":{"CreditLimit":"N/D","limitAvailable":"N/D","RiskRate":"N/D"}}
Ended b:{"c":{"CreditLimit":"N/D","limitAvailable":"N/D","RiskRate":"N/D"}}
Ended c:{"c":{"CreditLimit":"N/D","limitAvailable":"N/D","RiskRate":"N/D"}}

请注意,方法a,b和c的.then()子句都从方法c返回了相同的json。就像某种方法可以访问其他方法一样,返回promise。如果操纵up的代码没有破坏每个方法,则返回正确的json。仅当一种或多种然后一种方法超时时,才会出现此问题。我该如何解决?

1 个答案:

答案 0 :(得分:1)

您在模块内部的变量data尚未在任何地方声明,因此它是全局的。所有模块都访问相同的内存区域,因此值将被覆盖。