使用 Puppeteer 抓取网页时,如何让抓取代码更方便、更易读?

时间:2018-12-04 07:35:42

标签: web-crawler puppeteer

我开始使用 Puppeteer 抓取网页,并且已经成功抓取了两个购物网站(Imvely、Naning)。下面是代码。

app.js

const express = require('express');
const Crawling = require("./crawling/Crawling");
const BrandList = require("./brand/BrandList");

// Start one crawl per configured brand. The crawls are started without
// awaiting each other, so they run concurrently (Crawling launches its own
// browser for every call).
for (const config of Object.values(BrandList)) {
    const { brand, brandUrl, productList, product, title, price, image, productUrl } = config;

    // Crawling is async: without a rejection handler a failed crawl would
    // surface only as an unhandled promise rejection with no brand context.
    Crawling(brand, brandUrl, productList, product, title, price, image, productUrl)
        .catch((error) => {
            console.error(`Crawling failed for ${brand}:`, error);
        });
}

const app = express();

// The server is started independently of the crawls above; it does not wait
// for them to finish.
app.listen(4000, () => {
    console.log("Start!!!!!!!");
});

Crawling.js

const puppeteer = require('puppeteer');
const fs = require('fs');

/**
 * Scrapes the product listing of one brand and stores it as JSON in
 * `./db/<brand>.json`.
 *
 * jQuery is injected into the page and all product selectors are resolved
 * through it, so jQuery-only selector extensions can be used for the
 * `product*` arguments.
 *
 * @param {string} brand - Brand name; used to build the output file name.
 * @param {string} brandUrl - URL of the listing page to open.
 * @param {string} productList - Selector that must be present before scraping starts.
 * @param {string} product - Selector matching each product element.
 * @param {string} productTitle - Selector, relative to a product, whose text is the title.
 * @param {string} productPrice - Selector, relative to a product, whose text is the price.
 * @param {string} productImage - Selector, relative to a product, whose `src` is the image URL.
 * @param {string} productUrl - Selector, relative to a product, whose `href` is the product link.
 * @returns {Promise<void>} Settles once the browser has been closed. A failed
 *     write is logged, not thrown; navigation/scraping errors still reject.
 */
const Crawling = async (brand, brandUrl, productList, product, productTitle, productPrice, productImage, productUrl) => {

    const browser = await puppeteer.launch();

    // Everything after launch runs inside try/finally so the browser process
    // is closed even when navigation, scraping or writing throws.
    try {
        const page = await browser.newPage();

        await page.goto(`${brandUrl}`);

        await page.waitForSelector(productList);
        // Fixed 5 s pause after the list container appears.
        // NOTE(review): presumably gives late-loading items time to render — confirm.
        await page.waitFor(5000);
        await page.addScriptTag({url: 'https://code.jquery.com/jquery-3.2.1.min.js'});

        // The callback runs inside the page, so the selectors have to be
        // passed in explicitly as a serializable argument.
        const result = await page.evaluate(({ product, productTitle, productPrice, productImage, productUrl }) => {

            const items = [];

            $(product).each(function () {
                const item = $(this);

                items.push({
                    title: $.trim(item.find(productTitle).text()),
                    price: item.find(productPrice).text(),
                    image: item.find(productImage).attr('src'),
                    url: item.find(productUrl).attr('href')
                });
            });

            return items;

        }, { product, productTitle, productPrice, productImage, productUrl });

        const file = `./db/${brand}.json`;

        // A write failure may arrive as a resolved `false` or as a rejection;
        // both are treated the same way here.
        let written;
        try {
            written = await write_file(file, JSON.stringify(result));
        } catch (error) {
            if (error !== false) {
                console.error(error);
            }
            written = false;
        }

        if (written === false) {
            console.error(`Error: Unable to write products to ${file}`);
        }
    } finally {
        await browser.close();
    }

};

/**
 * Writes `data` to `file` as UTF-8 text.
 *
 * The returned promise never rejects. A failure is logged and reported as a
 * resolved `false`, so callers can test the awaited value with `=== false`.
 *
 * @param {string} file - Path of the file to write.
 * @param {string} data - Text to write.
 * @returns {Promise<boolean>} `true` when the file was written, `false` otherwise.
 */
const write_file = (file, data) => new Promise((resolve) => {
    fs.writeFile(file, data, 'utf8', (error) => {
        if (error) {
            console.error(error);
            resolve(false);
            return;
        }
        resolve(true);
    });
});

module.exports = Crawling;

BrandList.js

// Per-brand crawl configuration. Each entry holds the brand name, the listing
// URL and the selectors used to locate the product list, the individual
// products, and each product's title, price, image and link.

// Imvely: the price selector uses the jQuery-only `:eq()` extension.
const imvely = {
    brand: "Imvely",
    brandUrl: "http://imvely.com/product/list.html?cate_no=72",
    productList: "ul.prdList",
    product: "ul.prdList > li.item",
    title: "p.name span",
    price: ".xans-product-listitem > li:eq(1) > span",
    image: ".prdImg > a > img",
    productUrl: ".prdImg > a"
};

// Naning
const naning = {
    brand: "Naning",
    brandUrl: "https://www.naning9.com/shop/listplace.php",
    productList: "div.item_list",
    product: "div.item_list > div.list_cell",
    title: ".item_text > .item_name",
    price: ".item_price > p.inblock.dis",
    image: "a > img",
    productUrl: "a"
};

// Keyed by brand name.
const BrandList = {
    Imvely: imvely,
    Naning: naning
};

module.exports = BrandList;

我认为这还不够好。我不确定具体问题出在哪里,但我想改进我的抓取代码,让它更方便、更易读。

例如,我想知道当有更多品牌时是否存在问题。

那么,我该如何改善?

0 个答案:

没有答案