我正在使用Node和Puppeteer开发抓取/抓取工具。我正在尝试同时运行多个搜寻器。每个人访问一个不同的站点,登录并执行操作。最后,它们全部返回一个包含数据的json。发生错误时,该方法会在字段上返回一个带有“ N / A”的json。
主文件:
var a = require('./a');
var b = require('./b');
var c= require('./c');
var d= require('./d');
const http = require('http');
var url = require("url");
http.createServer((request, response) => {
if (request.method === 'GET') {
var urlObj = url.parse(request.url, true);
// cnpj is the company id I want my scrapper to look for.
if (urlObj['query']['cnpj'] != undefined) {
var cnpj = urlObj['query']['cnpj'];
response.setHeader('Content-Type', 'application/json');
response.writeHeader(200);
getCotacao(cnpj).then(ret => {
response.end(JSON.stringify(ret));
})
}
} else {
response.statusCode = 404;
response.end();
}
}).listen(8080, 'localhost');
async function getCotacao(cnpj) {
var returnCount = 4;
var sleepCycle = 0;
var ret = [];
// run all of them
a(cnpj).then(response => { console.log('Ended a:' + JSON.stringify(response)); ret = ret.concat(response); returnCount--; });
b(cnpj).then(response => { console.log('Ended b:' + JSON.stringify(response)); ret = ret.concat(response); returnCount--; });
c(cnpj).then(response => { console.log('Ended: c:' + JSON.stringify(response)) ; ret = ret.concat(response); returnCount--; });
d(cnpj).then(response => { console.log('Ended d:' + JSON.stringify(response)) ; ret = ret.concat(response); returnCount--; });
//wait them to finish or not
while (returnCount > 0 && sleepCycle < 90) { // 1:30m timeout. After the timeout or all promises returned, end the code. Which ever comes first
await sleep(1000).then(function () {
sleepCycle++;
console.log("sleepCycle: " + sleepCycle);
});
}
return ret;
}
我的a,b,c方法看起来都一样:
var puppeteer = require('puppeteer');
const a= async (cnpj) => {
var browser;
try {
var timeout = 3000; // 7 segundos
browser = await puppeteer.launch(
{ headless: true
}
);
const page = await browser.newPage();
await page.goto('a_website.com', {waitUntil: 'load', timeout: timeout});
await page.waitForSelector('#txtLogin', {timeout: timeout});
await page.focus('#txtLogin');
await page.keyboard.type('my_login');
await page.focus('#txtSenha');
await page.keyboard.type('my_pass');
await page.click('#btnLogin');
await page.waitForSelector('#btnSic', {timeout: timeout});
await page.click('#btnSic');
await page.waitForSelector('#txtCnpj', {timeout: timeout});
await page.focus('#txtCnpj');
await page.keyboard.type(cnpj);
await page.click('#btnAnalisar');
await page.waitForSelector('#Finaliza', {timeout: timeout});
const CreditLimit = await page.evaluate(() => document.querySelector('#Finaliza > table:nth-child(3) > tbody > tr:nth-child(5) > td:nth-child(2) > span').textContent);
const RiskRate = await page.evaluate(() => document.querySelector('#Finaliza > table:nth-child(3) > tbody > tr:nth-child(6) > td:nth-child(2) > span').textContent);
const limitAvailable = "";
data = {
'a':
{
CreditLimit,
RiskRate,
limitAvailable
}
}
await browser.close();
return data;
} catch (e) {
console.log(e);
data = {
'a': {
'CreditLimit': 'N/D',
'limitAvailable': 'N/D',
'RiskRate': 'N/D'
}
}
if (browser)
await browser.close();
return data;
}
};
module.exports = a;
问题:有时(并非总是:)),当我运行它时,其中一个方法(a,b,c或d)获得伪指令超时并完成,接近于另一个方法也完成的时间,两者在.then()子句上获得相同的返回值。日志为:
sleepCycle: 1
sleepCycle: 2
sleepCycle: 3
sleepCycle: 4
{ TimeoutError: Navigation Timeout Exceeded: 3000ms exceeded
at Promise.then (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\LifecycleWatcher.js:142:21)
-- ASYNC --
at Frame.<anonymous> (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\helper.js:111:15)
at Page.goto (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\Page.js:674:49)
at Page.<anonymous> (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\helper.js:112:23)
at a (C:\Users\fabyt\PhpstormProjects\robo_garantia\src\a.js:15:24)
at process._tickCallback (internal/process/next_tick.js:68:7) name: 'TimeoutError' }
{ TimeoutError: Navigation Timeout Exceeded: 3000ms exceeded
at Promise.then (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\LifecycleWatcher.js:142:21)
-- ASYNC --
at Frame.<anonymous> (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\helper.js:111:15)
at Page.goto (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\Page.js:674:49)
at Page.<anonymous> (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\helper.js:112:23)
at b(C:\Users\fabyt\PhpstormProjects\robo_garantia\src\b.js:21:20)
at process._tickCallback (internal/process/next_tick.js:68:7) name: 'TimeoutError' }
sleepCycle: 5
{ TimeoutError: waiting for selector "#idToken1" failed: timeout 3000ms exceeded
at new WaitTask (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\DOMWorld.js:561:28)
at DOMWorld._waitForSelectorOrXPath (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\DOMWorld.js:490:22)
at DOMWorld.waitForSelector (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\DOMWorld.js:444:17)
at Frame.waitForSelector (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\FrameManager.js:628:47)
at Frame.<anonymous> (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\helper.js:112:23)
at Page.waitForSelector (C:\Users\fabyt\PhpstormProjects\robo_garantia\node_modules\puppeteer\lib\Page.js:1089:29)
at c(C:\Users\fabyt\PhpstormProjects\robo_garantia\src\c.js:16:24)
at process._tickCallback (internal/process/next_tick.js:68:7) name: 'TimeoutError' }
Ended a:{"c":{"CreditLimit":"N/D","limitAvailable":"N/D","RiskRate":"N/D"}}
Ended b:{"c":{"CreditLimit":"N/D","limitAvailable":"N/D","RiskRate":"N/D"}}
Ended c:{"c":{"CreditLimit":"N/D","limitAvailable":"N/D","RiskRate":"N/D"}}
sleepCycle: 6
sleepCycle: 7
sleepCycle: 8
sleepCycle: 9
sleepCycle: 10
Ended d:{"d":{"CreditLimit":"1.000.000.00","limitAvailable":"R$ 1,000,000.00","RiskRate":"1.5000"}}
sleepCycle: 11
重要的部分在这里:
Ended a:{"c":{"CreditLimit":"N/D","limitAvailable":"N/D","RiskRate":"N/D"}}
Ended b:{"c":{"CreditLimit":"N/D","limitAvailable":"N/D","RiskRate":"N/D"}}
Ended c:{"c":{"CreditLimit":"N/D","limitAvailable":"N/D","RiskRate":"N/D"}}
请注意,方法a,b和c的.then()子句都从方法c返回了相同的json。就像某种方法可以访问其他方法一样,返回promise。如果操纵up的代码没有破坏每个方法,则返回正确的json。仅当一种或多种然后一种方法超时时,才会出现此问题。我该如何解决?
答案 0 :(得分:1)
您在模块内部的变量data
尚未在任何地方声明,因此它是全局的。所有模块都访问相同的内存区域,因此值将被覆盖。