我正在尝试抓取一个无限滚动的网站。
我已经尝试控制滚动,但页面仍然会一直滚动,直到到达网页末尾。
这是我的代码:
const puppeteer = require("puppeteer");

/**
 * Scrape product image URLs and brand names from the men's polos / t-shirts
 * listing page under `url`.
 *
 * The page is scrolled down one viewport at a time (pausing after each step
 * so content can load) before the product elements are read.
 *
 * @param {string} url - Base site URL; the listing path and query string are appended to it.
 * @param {Function} callBack - Invoked once as `callBack(products, true)`.
 *   NOTE(review): the second argument is presumably a "finished" flag — confirm with callers.
 * @param {{maxScrolls?: number}} [options] - `maxScrolls` caps how many
 *   viewport-sized scroll steps are performed (default: no cap).
 * @returns {Promise<void>} Resolves after the callback has run and the browser is closed.
 */
module.exports.scraper = async (url, callBack, { maxScrolls = Infinity } = {}) => {
  const wait = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    await page.setUserAgent(
      "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    );
    await page.setViewport({ width: 1200, height: 768 });

    await page.goto(`${url}/products/?department=men&l2_category=polos-t-shirts`, {
      waitUntil: "networkidle0",
    });

    // Measure the rendered page height once, up front. Content appended later
    // by the infinite scroll does not extend this bound.
    const bodyHandle = await page.$("body");
    const box = bodyHandle ? await bodyHandle.boundingBox() : null;
    if (bodyHandle) {
      await bodyHandle.dispose();
    }
    // boundingBox() yields null when the element is not visible; treat that as
    // "nothing to scroll" instead of throwing on destructuring.
    const height = box ? box.height : 0;

    // Scroll one viewport at a time, pausing to let content load.
    const viewportHeight = page.viewport().height;
    let scrolledSoFar = 0;
    let scrollCount = 0;
    while (scrolledSoFar + viewportHeight < height && scrollCount < maxScrolls) {
      await page.evaluate((step) => {
        window.scrollBy(0, step);
      }, viewportHeight);
      await wait(1600);
      scrolledSoFar += viewportHeight;
      scrollCount += 1;
    }

    const data = await page.evaluate(() => {
      window.scrollTo(0, 0);
      const products = [];
      document.querySelectorAll(".product-wrap").forEach((productElement) => {
        const productJson = {};
        try {
          productJson.imageUrl = productElement.querySelector(".renderedImg").src;
          productJson.brandName = productElement.querySelector(".brand-name").innerText;
        } catch (e) {
          // A missing child element leaves this entry partially filled; the
          // error is logged to the page's console and the entry is still kept.
          console.log(e);
        }
        products.push(productJson);
      });
      return products;
    });

    await wait(100);
    callBack(data, true);
  } finally {
    // Always release the browser, even when navigation, scraping or the
    // callback throws.
    await browser.close();
  }
};
在这种情况下如何抓取?
答案 0 :(得分:0)
这是处理无限滚动的一种策略。它在循环中反复执行“滚动/比较”,直到滚动不再起作用为止。也就是说,如果在我们让页面滚动之后,scrollTop 的值仍与上一次迭代时相同,就认为已经滚动到底部。在极端情况下,浏览器最终会耗尽堆内存并崩溃,但对于普通站点来说,这是一个可用的起点:
const puppeteer = require('puppeteer');
const url = 'https://example.com';

/**
 * Open `url`, keep scrolling the document down until scrolling no longer
 * changes `scrollTop`, then save a full-page screenshot to `example.png`.
 */
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();

    // Forward the page's console output to the Node console, tab-separated.
    page.on('console', async (msg) => {
      try {
        const vals = await Promise.all(msg.args().map((arg) => arg.jsonValue()));
        console.log(vals.join('\t'));
      } catch (err) {
        // Serializing an argument can fail; fall back to the message text
        // rather than leaving a rejected promise in an event handler.
        console.log(msg.text());
      }
    });

    await page.goto(url);

    await page.evaluate(() => {
      const wait = (duration) => {
        console.log('waiting', duration);
        return new Promise((resolve) => setTimeout(resolve, duration));
      };

      // The scroll loop is started but not returned, so this evaluate() call
      // resolves right away; completion is signalled through window.atBottom,
      // which the Node side polls for below.
      (async () => {
        window.atBottom = false;
        const scroller = document.documentElement; // usually what you want to scroll, but not always
        let lastPosition = -1;
        while (!window.atBottom) {
          scroller.scrollTop += 1000;
          // scrolling down all at once has pitfalls on some sites: scroller.scrollTop = scroller.scrollHeight;
          await wait(300);
          const currentPosition = scroller.scrollTop;
          if (currentPosition > lastPosition) {
            console.log('currentPosition', currentPosition);
            lastPosition = currentPosition;
          } else {
            // scrollTop did not advance after a scroll attempt: treat as bottom.
            window.atBottom = true;
          }
        }
        console.log('Done!');
      })();
    });

    await page.waitForFunction(() => window.atBottom === true, {
      timeout: 900000,
      polling: 1000, // poll for finish every second
    });

    await page.screenshot({ path: 'example.png', fullPage: true });
  } finally {
    // Close the browser even when navigation, waiting or the screenshot fails.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});