我为实时体育比分制作了一个网络抓取工具。我有2个文件index.js充当服务器,还有一个scraper.js文件负责抓取。这是我的index.js文件代码:
const express = require('express');
const scraper = require('./util/scraper');
const app = express();
app.get('/scores', (req, res) => {
const scores = new Promise((resolve, reject) => {
scraper
.scrapeLiveScores()
.then(data => {
resolve(data)
}).catch(err => reject('scores scrape failed'))
});
// res.send(JSON.stringify(scores));
// Use promise.all([]) if more than one scraper is used and list each of them
Promise.resolve(scores)
.then(data => {
res.send(JSON.stringify(data));
}).catch(function() {
console.log("promise rejected");
});
});
app.set( 'port', ( process.env.PORT || 5000 ));
// Start node server
app.listen( app.get( 'port' ), function() {
console.log( 'Node server is running on port ' + app.get( 'port' ));
});
module.exports = app;
当我在本地运行heroku时,它返回所需数据的json。但是,当我将其部署到heroku时,出现控制台错误“承诺被拒绝”的应用程序错误。
const puppeteer = require('puppeteer');
const scrapeLiveScores = async () => {
try{
//Open the browser
var browser = await puppeteer.launch({ headless: true });
//Open a new page
var page = await browser.newPage();
//Enter url in the page
await page.goto('https://www.flashscore.com.au/basketball/usa/nba/');
//wait for selector to load in
await page.waitForSelector('div.event__score.event__score--home');
var scores = await page.evaluate(() => {
var basePath = 'div.leagues--live > div > div.event__match';
var homeScore = document.querySelectorAll('div.leagues--live > div > div.event__match > div.event__score--home');
var awayScore = document.querySelectorAll('div.leagues--live > div > div.event__match > div.event__score--away');
var homeTeam = document.querySelectorAll('div.leagues--live > div > div.event__match > div.event__participant.event__participant--home');
var awayTeam = document.querySelectorAll('div.leagues--live > div > div.event__match > div.event__participant.event__participant--away');
var stages = document.querySelectorAll('div.leagues--live > div > div.event__match > div.event__stage > div');
//#g_3_6LLArB7N > div.event__stage > div
var scoresArray = [];
for(var i = 0; i < homeScore.length; i++){
if(stages[i] != null){
scoresArray[i] = {
homeTeam: homeTeam[i].innerText.trim(),
homeScore: homeScore[i].innerText.trim(),
awayTeam: awayTeam[i].innerText.trim(),
awayScore: awayScore[i].innerText.trim(),
stage: stages[i].innerText.trim()
};
}else{
scoresArray[i] = {
homeTeam: homeTeam[i].innerText.trim(),
homeScore: homeScore[i].innerText.trim(),
awayTeam: awayTeam[i].innerText.trim(),
awayScore: awayScore[i].innerText.trim(),
stage: "-"
};
}
}
return scoresArray;
});
console.log(scores);
await browser.close();
return scores;
}catch(err){
await browser.close();
}
}
module.exports.scrapeLiveScores = scrapeLiveScores;
我已经坚持了几个小时。这是日志:
2019-12-30T08:06:23.178178+00:00 app[web.1]: scraper error TypeError: Cannot read property 'close' of undefined
2019-12-30T08:06:23.178216+00:00 app[web.1]: at Object.scrapeLiveScores (/app/util/scraper.js:60:23)
2019-12-30T08:06:23.178218+00:00 app[web.1]: at processTicksAndRejections (internal/process/task_queues.js:93:5)
2019-12-30T08:06:23.182971+00:00 heroku[router]: at=info method=GET path="/scores" host=nbalive-api.herokuapp.com request_id=7f5b79cc-dd2e-433b-a4db-56052f8a5cdd fwd="99.247.208.27" dyno=web.1 connect=1ms service=23ms status=500 bytes=231 protocol=https
第60行是catch(err)中的{await browswer.close()}
答案 0 :(得分:0)
要调试此问题,您需要记录所得到的实际错误。而且,在使用它时,请简化代码并删除将现有承诺包装在另一个不必要的承诺中的承诺反模式。
将代码更改为此:
app.get('/scores', (req, res) => {
scraper.scrapeLiveScores().then(data => {
res.send(data);
}).catch(function(e) {
console.log("scraper error", e);
res.status(500).send("scraper error");
});
});
然后,console.log()
中的.catch()
语句将向您显示您得到的确切错误。
除了删除promise反模式,简化代码并记录实际获得的错误并消除JSON.stringify()
自动完成的res.send()
之外,这还会在存在以下情况时发送对请求的响应错误,而不仅仅是让浏览器挂起。
答案 1 :(得分:0)
我没有使用刮板,但是我已经在Heroku上成功使用了Cheerio和Puppeteer:
const puppeteer = require('puppeteer');
const $ = require('cheerio');
exports.scraper = (req, res) => {
const url = 'https://www.my-target-url.com';
puppeteer
.launch()
.then(function(browser) {
return browser.newPage();
})
.then(function(page) {
return page.goto(url).then(function() {
return page.content();
});
})
.then(function(html) {
// target a css selector
$('#my_selector', html).each(function() {
console.log($(this).text());
res.send($(this).text());
});
})
.catch(function(err) {
//handle error
});
};