我有一个使用 Puppeteer、Discord.js 和 Node.js 编写的抓取机器人。程序里用 setInterval 实现了一个每隔 10 秒执行一次的无限循环,但 setInterval 每执行一轮,它都会把每个周期的所有数据再发给我一遍。我想知道怎样才能只获取最后一个周期的最新一条数据,而不是每个周期的数据。
// Scraper bot from the question: opens a page with Puppeteer, re-reads it on a
// timer and relays a matching task row to Discord when a user writes a message.
const puppeteer = require('puppeteer');
const Discord = require('discord.js');
const client = new Discord.Client();
const url = 'url to scrap';
// Result strings that have already been handed to a Discord handler.
var clocks = [];
(async () => {
const URL = url
// The browser and page are created once and reused by every timer tick.
const browser = await puppeteer.launch()
const page = await browser.newPage()
await page.goto(URL, { 'waitUntil' : 'networkidle2' });
// The callback runs every 8000 ms. It is async and nothing here awaits it, so
// a rejection inside it is not caught anywhere in this snippet.
setInterval(async () => {
// The callback below uses `document`, i.e. it is evaluated in the page. It
// returns a summary string for the first row whose second column equals one
// of the keywords; when no row matches it falls through and yields undefined.
let clock = await page.evaluate(()=>{
var a = document.getElementById("task-listing-datatable").getAttribute("data-tasks");
// NOTE(review): eval executes whatever the attribute contains; presumably the
// value is an array literal — confirm before replacing it with JSON.parse.
var ar = eval(a);
var keyword = ['asdad', 'asdakdada', 'mama', 'Duplicate Fashion Product Identification Task'];
for(let i=0; i<ar.length; i++){
for(let j=0; j<keyword.length; j++){
if(ar[i][1] === keyword[j]){
// `job` is assigned but never read afterwards.
let job = (`${ar[i][1]}`);
return (`${ar[i][0]} ${ar[i][1]} Paga ${ar[i][3]} Tareas: ${ar[i][5]}`);
}
}
}
});
console.log(`==== first login ====`)
console.log(`==================`)
// Only values that are not yet in `clocks` get through. An undefined result
// (no matching row) also passes this check the first time it occurs.
if(!clocks.includes(clock)) {
// NOTE(review): a new 'message' handler is registered for every unseen value
// and none is removed in this snippet, so each incoming non-bot message is
// answered once per handler registered so far.
client.on('message', (message)=>{
// Ignore messages written by bots.
if(message.author.bot === false) {
message.channel.send(clock);
}
});
clocks.push(clock);
// Save the clock so you will remember it next time.
}
// Reload so the next tick reads fresh page content.
await page.reload();
}, 8000)
})()
client.login('discordjs token');
消息的显示效果如下:
如您所见,现在它只在每次数据发生变化时发送消息,而不再发送每个周期的所有数据。[图片:enter image description here]
答案 0 :(得分:1)
每次 setInterval 的回调运行时,它都会刷新页面,把信息收集到 clock 变量中,然后通过 Discord 发送。
问题在于,它不知道自己已经给您发送过什么,因此您每次都会收到一些相同的数据。
解决方案是把找到的数据保存下来,只有当本次取得的数据与之前所有的数据都不相同时,才发送 Discord 消息。
因此,您需要某种数据存储:
var clocks = [];
(async () => {
setInterval(async () => {
const URL = url
const browser = await puppeteer.launch()
// ...
然后,在得到当前的 clock 之后,检查它是否尚未保存在数据存储中。
if(!clocks.includes(clock)) {
如果尚未保存,就说明这是需要发送的新数据。
// Act only when this clock value has not been stored before.
if(!clocks.includes(clock)) {
// NOTE(review): unlike the question's code, this handler has no
// message.author.bot check — confirm the bot cannot answer its own messages.
client.on('message', (message)=>{
message.channel.send(clock);
});
clocks.push(clock); // Save the clock so you will remember it next time.
}
综合起来,您会得到类似下面的代码:
// Intermediate version from the answer: adds the `clocks` store so that a
// result string is only handed to Discord the first time it is seen.
var clocks = [];
(async () => {
// The callback runs every 8000 ms.
setInterval(async () => {
const URL = url
// NOTE(review): a browser is launched on every tick and this snippet never
// closes it.
const browser = await puppeteer.launch()
const page = await browser.newPage()
await page.goto(URL, { 'waitUntil' : 'networkidle2' })
// Evaluated in the page (it uses `document`): returns a summary string for
// the first row whose second column equals a keyword, otherwise undefined.
let clock = await page.evaluate(()=>{
var a = document.getElementById("task-listing-datatable").getAttribute("data-tasks");
// NOTE(review): eval executes whatever the attribute contains.
var ar = eval(a);
var keyword = ['asdad', 'asdakdada', 'mama', 'What Is The Best Dialogue Category About Phones'];
for(let i=0; i<ar.length; i++){
for(let j=0; j<keyword.length; j++){
if(ar[i][1] === keyword[j]){
// `job` is assigned but never read afterwards.
let job = (`${ar[i][1]}`);
return (`${ar[i][0]} ${ar[i][1]} Paga ${ar[i][3]} Tareas: ${ar[i][5]}`);
}
}
}
});
console.log(`==== first login ====`)
console.log(`==================`)
// Only values not yet stored in `clocks` register a handler.
if(!clocks.includes(clock)) {
// NOTE(review): no message.author.bot check here, and handlers registered on
// earlier ticks are never removed in this snippet.
client.on('message', (message)=>{
message.channel.send(clock);
});
clocks.push(clock); // Save the clock so you will remember it next time.
}
await page.reload();
console.log(`after reload`)
}, 8000)
})()
不过,其实没有必要每 10 秒就启动一个新的浏览器窗口;只加载一次页面,然后每 10 秒刷新一次,对您的计算机来说负担可能会更小。
// Poll the task page on a timer and answer Discord messages with the newest
// match that has not been reported before.
//
// Relies on `puppeteer`, `client` and `url` being defined earlier in the file.

/** Every match string seen so far; used to skip results already reported. */
var clocks = [];

/** Most recent previously-unseen match, or null until one has been found. */
let latestClock = null;

/** True while a polling pass is still waiting on the page. */
let isPolling = false;

// Register the reply handler exactly once. Registering it inside the timer
// callback would add another handler for every new value, and each of them
// would answer every later message.
client.on('message', (message) => {
  // Skip bot authors (including this bot) so a reply cannot trigger another
  // reply, and stay silent until there is something to report.
  if (message.author.bot || latestClock === null) {
    return;
  }
  message.channel.send(latestClock).catch((err) => {
    console.error('Failed to send Discord message:', err);
  });
});

(async () => {
  const URL = url;
  // Launch the browser and load the page once; each tick only reloads it.
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto(URL, { waitUntil: 'networkidle2' });

  setInterval(async () => {
    // setInterval does not wait for an async callback, so skip this tick when
    // the previous pass is still running instead of overlapping on the page.
    if (isPolling) {
      return;
    }
    isPolling = true;
    try {
      // Evaluated in the page: returns a summary string for the first row
      // whose second column equals one of the keywords, or null if none does.
      const clock = await page.evaluate(() => {
        const raw = document
          .getElementById('task-listing-datatable')
          .getAttribute('data-tasks');
        // NOTE(review): eval executes whatever the attribute contains. It is
        // kept because the attribute format is not visible here; switch to
        // JSON.parse once the value is confirmed to be strict JSON.
        const rows = eval(raw);
        const keywords = ['asdad', 'asdakdada', 'mama', 'What Is The Best Dialogue Category About Phones'];
        for (const row of rows) {
          if (keywords.includes(row[1])) {
            return `${row[0]} ${row[1]} Paga ${row[3]} Tareas: ${row[5]}`;
          }
        }
        return null;
      });
      console.log(`==== first login ====`);
      console.log(`==================`);
      // Ignore "no match" results and values that were already reported.
      if (clock != null && !clocks.includes(clock)) {
        clocks.push(clock); // Save the clock so you will remember it next time.
        latestClock = clock;
      }
      await page.reload();
    } catch (err) {
      // Log and keep the timer alive; a single failed pass should not end up
      // as an unhandled rejection.
      console.error('Polling pass failed:', err);
    } finally {
      isPolling = false;
    }
  }, 8000);
})().catch((err) => {
  console.error('Failed to start the scraper:', err);
});
现在,为了确保页面函数(其返回值会赋给 clock)每次找到的都是新的数据点,我们需要把以往的数据点传给它:
// Hand the stored values to the page function as an argument. Inside the
// callback the parameter `clocks` shadows the outer array.
// NOTE(review): presumably the page receives a serialized copy, so changes
// made to it inside the callback would not reach the outer array — confirm
// against the Puppeteer documentation.
let clock = await page.evaluate(clocks=>{
// ...
}, clocks);
现在,在页面函数内部就可以访问旧的数据点了。
不要再使用原来的写法:
if(ar[i][1] === keyword[j]){
let job = (`${ar[i][1]}`); // What is this for?
return (`${ar[i][0]} ${ar[i][1]} Paga ${ar[i][3]} Tareas: ${ar[i][5]}`);
}
而是检查该数据点是否已经存在于 clocks 数组中,只有当它是新的数据点时才返回:
// The row matches a keyword: build its summary string, but return it only if
// it is not already among the previously stored values.
if(ar[i][1] === keyword[j]){
let dataPoint =`${ar[i][0]} ${ar[i][1]} Paga ${ar[i][3]} Tareas: ${ar[i][5]}`;
// A data point that is already stored is skipped rather than returned.
if(!clocks.includes(dataPoint)){
return dataPoint;
}
}