我制作了一个作业脚本,用于定期进行网页抓取,并将一些信息保存在MongoDB数据库中。我试图获得尽可能多的性能,现在,我能够每10秒执行一次脚本。但是,我想进一步减少它,如果可能的话,在1-10秒之间。问题是,当我减少它时,我的代码将引发以下警告,并且某些执行无法解决:
(node:9472) MaxListenersExceededWarning: Possible EventEmitter memory leak detected. 11 exit listeners added. Use emitter.setMaxListeners() to increase limit
有没有办法改进代码?
const $ = require('cheerio');
const MarketModel = require('./models/marketModel');
const mongoose = require('mongoose');
const puppeteer = require('puppeteer');
var schedule = require('node-schedule');
const {
Cluster
} = require('puppeteer-cluster');
//Connection to DataBase:
mongoose.connect('mongodb://localhost:27017/Tradheo', {
useNewUrlParser: true
});
mongoose.connection.on('error', error => console.log(error));
mongoose.Promise = global.Promise;
getMarketData = async () => {
console.log("Web scraping to get market data...")
let markets = []
let marketSpain = {
country: 'Spain',
name: 'IBEX 35',
companies: []
}
let marketGermany = {
country: 'Germany',
name: 'DAX',
companies: []
}
const cluster = await Cluster.launch({
concurrency: Cluster.CONCURRENCY_PAGE,
maxConcurrency: 2,
});
await cluster.task(async ({
page,
data: url
}) => {
await page.goto({
waitUntil: 'domcontentloaded'
});
await page.setRequestInterception(true);
page.on('request', request => {
if (request.resourceType() === 'document') {
request.continue();
} else {
request.abort();
}
});
const html = await page.content();
if (url === 'https://uk.investing.com/equities/spain') {
console.log('Spain data page content loaded');
$("table[class='genTbl closedTbl crossRatesTbl elpTbl elp30'] > tbody > tr", html).each((i, elem) => {
marketSpain.companies.push({
name: $("td[class='bold left noWrap elp plusIconTd'] > a", html).eq(i).html(),
last: $("td", elem).eq(2).text(),
high: $("td", elem).eq(3).text(),
low: $("td", elem).eq(4).text(),
change: $("td", elem).eq(5).text(),
changePerCent: $("td", elem).eq(6).text(),
volume: $("td", elem).eq(7).text(),
time: $("td", elem).eq(8).text(),
purchase: false,
sale: false
});
});
markets.push(marketSpain);
} else {
console.log('Germany data page content loaded');
$("table[class='genTbl closedTbl crossRatesTbl elpTbl elp30'] > tbody > tr", html).each((i, elem) => {
marketGermany.companies.push({
name: $("td[class='bold left noWrap elp plusIconTd'] > a", html).eq(i).html(),
last: $("td", elem).eq(2).text(),
high: $("td", elem).eq(3).text(),
low: $("td", elem).eq(4).text(),
change: $("td", elem).eq(5).text(),
changePerCent: $("td", elem).eq(6).text(),
volume: $("td", elem).eq(7).text(),
time: $("td", elem).eq(8).text(),
purchase: false,
sale: false
});
});
markets.push(marketGermany);
}
if (markets.length === 2) {
MarketModel.create({
markets,
}, (err) => {
if (err) return handleError(err);
})
console.log("Done!")
}
});
cluster.queue(url1);
cluster.queue(url2);
await cluster.idle();
await cluster.close();
}
var j = schedule.scheduleJob('*/10 * 8-17 * * 1-5', function () {
const now = new Date();
//Checks that time is between 8:30 - 17:35 (schedule of the stock exchange)
if (now.getHours() >= 8 && !(now.getHours() == 8 && now.getMinutes() < 30) && now.getHours() <= 17 && !(now.getHours() == 17 && now.getMinutes() > 35)) {
getMarketData();
}
});
更新:我添加了一些改进,例如将waitUntil属性设置为'domcontentloaded'并请求拦截以避免等待图像以及除html内容之外的任何类型的资源被加载。但是,似乎不足以实现目标。