如何使用puppeteer抓取无限滚动网站

时间:2020-10-06 14:07:58

标签: node.js web-scraping puppeteer

我正在尝试抓取一个无限滚动的网站。

我试图控制滚动的幅度,但页面仍然会一直滚动,直到到达网页末尾。

这是我的代码:

const puppeteer = require("puppeteer");

/**
 * Opens the men's polos/t-shirts listing under `url`, scrolls down the page one
 * viewport at a time so lazily loaded products get a chance to render, then
 * collects the image URL and brand name of every `.product-wrap` element and
 * passes the result to `callBack`.
 *
 * The page height is measured once, before scrolling starts, so content that
 * is appended while scrolling does not extend the scroll loop.
 *
 * @param {string} url - Base URL of the site; the listing path and query string are appended to it.
 * @param {Function} callBack - Called as `callBack(products, true)` with the scraped array.
 * @returns {Promise<void>} Resolves after the callback has run and the browser has been closed.
 */
module.exports.scraper = async (url, callBack) => {
    const wait = (ms) => new Promise((resolve) => setTimeout(() => resolve(), ms));

    const browser = await puppeteer.launch({ headless: false });

    // Close the browser on every exit path: without this, a failed navigation,
    // a failed evaluate or a throwing callback leaves the browser process running.
    try {
        const page = await browser.newPage();

        await page.setUserAgent(
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
        );

        await page.setViewport({ width: 1200, height: 768 });

        await page.goto(`${url}/products/?department=men&l2_category=polos-t-shirts`, {
            waitUntil: "networkidle0",
        });

        // Get the height of the rendered page. boundingBox() yields null when the
        // element has no visible box, so fall back to 0 (no scrolling) in that case.
        let height = 0;
        const bodyHandle = await page.$("body");
        if (bodyHandle) {
            try {
                const box = await bodyHandle.boundingBox();
                if (box) {
                    height = box.height;
                }
            } finally {
                await bodyHandle.dispose();
            }
        }

        // Scroll one viewport at a time, pausing to let content load
        const viewportHeight = page.viewport().height;
        let viewportIncr = 0;
        while (viewportIncr + viewportHeight < height) {
            await page.evaluate((step) => {
                window.scrollBy(0, step);
            }, viewportHeight);
            await wait(1600);
            viewportIncr = viewportIncr + viewportHeight;
        }

        // Runs inside the page: jump back to the top and read every product card.
        const data = await page.evaluate(() => {
            window.scrollTo(0, 0);
            const products = [];
            const productElements = document.querySelectorAll(".product-wrap");

            productElements.forEach((productElement) => {
                const productJson = {};
                try {
                    productJson.imageUrl = productElement.querySelector(".renderedImg").src;
                    productJson.brandName = productElement.querySelector(
                        ".brand-name",
                    ).innerText;
                } catch (e) {
                    // A card missing one of the expected children is still pushed,
                    // with whatever fields were read before the failure.
                    console.log(e);
                }
                products.push(productJson);
            });
            return products;
        });

        await wait(100);
        callBack(data, true);
    } finally {
        await browser.close();
    }
};

在这种情况下如何抓取?

1 个答案:

答案 0 :(得分:0)

这是处理无限滚动的一种策略:在循环中反复执行“滚动/比较”,直到滚动不再产生效果为止。也就是说,如果我们让页面继续滚动之后,scrollTop 的值仍与上一次迭代时相同,就认为已经滚动到底。在极端情况下,浏览器最终会耗尽堆内存并崩溃,但对于普通站点,这可以作为一个起点:

const puppeteer = require('puppeteer');
const url = 'https://example.com';

/**
 * Relays one browser console message to the Node console, joining its
 * arguments with tabs.
 *
 * @param {object} msg - Puppeteer console message passed to the 'console' page event.
 * @returns {Promise<void>} Never rejects; read failures fall back to the message text.
 */
async function relayConsoleMessage(msg) {
  try {
    const vals = await Promise.all(msg.args().map((arg) => arg.jsonValue()));
    console.log(vals.join('\t'));
  } catch (err) {
    // If an argument cannot be read back as JSON, print the message's own text
    // instead of letting the listener reject unhandled.
    console.log(msg.text());
  }
}

/**
 * Loads `url`, keeps scrolling the document down in 1000px steps until a step
 * no longer increases scrollTop, then saves a full-page screenshot to
 * example.png.
 *
 * @returns {Promise<void>} Resolves once the screenshot is written and the browser is closed.
 */
async function main() {
  const browser = await puppeteer.launch();

  // Close the browser on every exit path so a failed navigation or a timeout
  // does not leave the browser process running.
  try {
    const page = await browser.newPage();
    page.on('console', relayConsoleMessage);
    await page.goto(url);

    await page.evaluate(() => {
      const wait = (duration) => {
        console.log('waiting', duration);
        return new Promise((resolve) => setTimeout(resolve, duration));
      };

      // Deliberately not awaited: the scroll loop keeps running in the page and
      // reports completion through window.atBottom, which the Node side polls.
      void (async () => {
        window.atBottom = false;
        const scroller = document.documentElement; // usually what you want to scroll, but not always
        let lastPosition = -1;
        while (!window.atBottom) {
          scroller.scrollTop += 1000;
          // scrolling down all at once has pitfalls on some sites: scroller.scrollTop = scroller.scrollHeight;
          await wait(300);
          const currentPosition = scroller.scrollTop;
          if (currentPosition > lastPosition) {
            console.log('currentPosition', currentPosition);
            lastPosition = currentPosition;
          } else {
            // The last step did not move the page, so treat it as the bottom.
            window.atBottom = true;
          }
        }
        console.log('Done!');
      })();
    });

    await page.waitForFunction('window.atBottom === true', {
      timeout: 900000,
      polling: 1000, // poll for finish every second
    });

    await page.screenshot({ path: 'example.png', fullPage: true });
  } finally {
    await browser.close();
  }
}

main().catch((err) => {
  // Report the failure and signal it through the exit code instead of leaving
  // an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});