I've been using headless Chrome (puppeteer) for web crawling/scraping. The amount of target data is large (roughly 3,000~20,000 pages), so I built a Node.js scraping app that uses multiple Chrome tabs to run many scrapes asynchronously at once. The problem is that the app randomly stops opening target pages, and the `await`ed values never come back. I had to remove the timeout option from the headless Chrome calls, because a single timeout would reject and stop the entire scrape (`Promise.all` rejects as soon as any one of its tasks rejects).

In short: after scraping 100~300 pages, the app randomly stops opening target pages, the `await` never returns, and the app waits forever for a `resolve` that never comes. I suspect this may be a bug, since it happens at random. It doesn't happen when I open fewer than ~100 Chrome tabs over a run (with at most 6~7 windows open at once), and it doesn't happen when the app is written synchronously. (I did something similar last time with a smaller data set; since the data set was small, I simply made the whole app synchronous and waited for each result.)
Question
How can I tell when a page has failed to open properly? And can I make the app skip tasks that never respond, while still keeping the timeout option?
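What I'm after is roughly this: keep a per-task time limit, but have a task that exceeds it resolve to a sentinel value such as `null` instead of rejecting, so `Promise.all` keeps the rest of the batch alive. A rough sketch of the idea (`withTimeout` and the 30-second limit are names/values I made up for illustration):

// hypothetical helper: resolve to null after `ms` instead of rejecting,
// so one hung page can't stall Promise.all forever
function withTimeout(promise, ms) {
  let timer
  const timeout = new Promise(resolve => {
    timer = setTimeout(() => resolve(null), ms)
  })
  return Promise.race([promise, timeout]).finally(() => clearTimeout(timer))
}

// usage inside main(): a hung page yields null instead of hanging the batch
const result = await withTimeout(scrapePage(browser, targetUrl, pageNo), 30000)
if (result === null) console.log(`page ${pageNo} skipped (no response)`)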
Application structure
//main settings...
const puppeteer = require('puppeteer')
//data amount limited by redirecting to a different URL
const targetUrl = `http://www.samplesite.com/board/lists/?id=test&exception_mode=recommend&page=`
const pageMin = 1
const pageMax = 403 //each page has roughly 20~30 elements to scrape
//!!!heavy on RAM consumption & performance
const windowMax = 7 //maximum number of chrome windows open at once
...
//main scraping function
async function scrapePage(browser, targetUrl, pageNo){
  let page //declared outside try so the cleanup below can see it
  try{
    page = await browser.newPage() // <--sometimes the app never opens a new tab and is on hold forever....
    console.log(`browsing page ${pageNo} ...`)
    //prolonged the timeout to prevent a sudden stop of the task
    await page.goto(targetUrl + pageNo, {timeout: 3000000})
    const titles = await page.evaluate((selector) => {
      return [...document.querySelectorAll(selector)].map(el => {
        return { text: el.innerText, href: el.href, comment: 0 }
      })
    }, '.t_subject > a')
    return titles //an async function wraps the return value in a resolved Promise
  }catch(e){
    console.log(e)
    return [] //return an empty batch on failure
  }finally{
    if(page) await page.close()
  }
}
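Related to the first question above: as far as I understand, `page.goto` resolves with the page's main response, so checking its status might be one way to detect a page that didn't open properly. A sketch of that check inside `scrapePage` (not something I've verified at scale):

// possible "did the page actually open?" check: page.goto resolves with
// the main response, and a missing/bad response could be treated as a failure
const response = await page.goto(targetUrl + pageNo, {timeout: 3000000})
if (!response || !response.ok()) {
  throw new Error(`page ${pageNo} did not open properly (status: ${response && response.status()})`)
}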
...
async function main(){
  const browser = await puppeteer.launch({
    headless: false //set headless:false for gui debugging
    ,args: ['--no-sandbox', '--disable-setuid-sandbox'] //to help prevent RAM errors
  })
  let countLimit = 0
  let pageScrapePlan = []
  let scraped = [] // <------final result
  for(let i = pageMin; i <= pageMax; i++){
    console.log(`crawling on page ${i} / ${pageMax} ---- ${(i / pageMax * 100).toFixed(2)}%`)
    pageScrapePlan.push(scrapePage(browser, targetUrl, i))
    countLimit++
    if(countLimit % windowMax === 0 || i === pageMax) {
      //once the counter reaches the maximum number of windows, await the whole batch
      const resPage = await Promise.all(pageScrapePlan)
      resPage.forEach(elScrapedFromPage => {
        scraped = [...scraped, ...elScrapedFromPage] //concatenate arrays, not object spread
      })
      pageScrapePlan = []
    }
  }
  await browser.close() //outside the loop so the browser survives all batches
}
main()
//rethrow unhandled rejections so errors surface with a line number
process.on('unhandledRejection', up => { throw up });
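For reference, one direction I've been considering (not yet verified on the full data set) is batching with `Promise.allSettled` (available since Node 12.9), which waits for every task and reports a per-task status instead of rejecting the whole batch. That way I could keep the `goto` timeout and simply drop the failures; `scrapePage` would have to rethrow instead of swallowing the error for the rejection to show up here:

// alternative batching: keep the goto timeout, let timed-out pages
// reject, and filter them out instead of aborting the whole batch
const results = await Promise.allSettled(pageScrapePlan)
for (const r of results) {
  if (r.status === 'fulfilled') scraped = [...scraped, ...r.value]
  else console.log(`a page failed and was skipped: ${r.reason}`)
}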