我最近开始使用 Puppeteer 抓取网页,并且已经成功抓取了两个购物网站(Imvely、Naning)的商品数据。下面是我的代码。
app.js
const express = require('express');
const Crawling = require("./crawling/Crawling");
const BrandList = require("./brand/BrandList");
/**
 * Crawls every brand configured in BrandList, one after another.
 *
 * Brands are processed sequentially so that only one crawl is in flight at a
 * time; the original loop started all crawls at once without awaiting them,
 * which scales poorly as brands are added. A failure for one brand is logged
 * and does not prevent the remaining brands from being crawled.
 *
 * @returns {Promise<void>} Resolves once every brand has been attempted.
 */
const crawlAllBrands = async () => {
  for (const config of Object.values(BrandList)) {
    const { brand, brandUrl, productList, product, title, price, image, productUrl } = config;
    try {
      await Crawling(brand, brandUrl, productList, product, title, price, image, productUrl);
    } catch (error) {
      // Keep going: one broken site must not abort the other brands.
      console.error(`Failed to crawl ${brand}:`, error);
    }
  }
};

// Start crawling in the background; the HTTP server below starts immediately,
// as it did before. The trailing catch guards against an unhandled rejection.
crawlAllBrands().catch((error) => {
  console.error('Crawling aborted unexpectedly:', error);
});

const app = express();
app.listen(4000, () => {
  console.log("Start!!!!!!!");
});
Crawling.js
const puppeteer = require('puppeteer');
const fs = require('fs');
/**
 * Scrapes one brand's product listing page and saves the products as JSON.
 *
 * Launches a browser, opens `brandUrl`, waits for `productList` to appear,
 * injects jQuery into the page, collects title/price/image/url from every
 * element matching `product`, and writes the array to `./db/<brand>.json`.
 * The browser is always closed, even when a step fails.
 *
 * @param {string} brand - Brand name; used as the output file name.
 * @param {string} brandUrl - URL of the product listing page.
 * @param {string} productList - Selector that must be present before scraping starts.
 * @param {string} product - jQuery selector matching each product element.
 * @param {string} productTitle - jQuery selector (relative to a product) for the title text.
 * @param {string} productPrice - jQuery selector (relative to a product) for the price text.
 * @param {string} productImage - jQuery selector (relative to a product) for the image; its `src` is read.
 * @param {string} productUrl - jQuery selector (relative to a product) for the link; its `href` is read.
 * @returns {Promise<void>} Resolves after the browser has been closed. Rejects
 *   if launching, navigation, waiting or in-page evaluation fails. A failure
 *   to write the output file is logged and does not reject.
 */
const Crawling = async (brand, brandUrl, productList, product, productTitle, productPrice, productImage, productUrl) => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(brandUrl);
    await page.waitForSelector(productList);
    // Same fixed 5 s pause as before, but with a plain timer: page.waitFor is
    // deprecated in newer Puppeteer releases.
    await new Promise((resolve) => setTimeout(resolve, 5000));
    // jQuery is required because the configured selectors may use
    // jQuery-only syntax such as `:eq(1)`.
    await page.addScriptTag({ url: 'https://code.jquery.com/jquery-3.2.1.min.js' });

    const result = await page.evaluate(({ product, productTitle, productPrice, productImage, productUrl }) => {
      // This callback runs inside the page, where `$` is the injected jQuery.
      const result = [];
      $(product).each(function () {
        const title = $.trim($(this).find(productTitle).text());
        const price = $(this).find(productPrice).text();
        const image = $(this).find(productImage).attr('src');
        const url = $(this).find(productUrl).attr('href');
        result.push({ title, price, image, url });
      });
      return result;
    }, { product, productTitle, productPrice, productImage, productUrl });

    const outputPath = `./db/${brand}.json`;
    try {
      // write_file signals failure by rejecting, so it must be caught here;
      // comparing its resolved value with `false` could never match.
      await write_file(outputPath, JSON.stringify(result));
    } catch (error) {
      console.error(`Error: Unable to write products to ${outputPath}`, error);
    }
  } finally {
    // Always release the browser process, even if a step above threw.
    await browser.close();
  }
};
/**
 * Writes `data` to `file` as UTF-8 text, wrapping fs.writeFile in a Promise.
 *
 * @param {string} file - Destination path.
 * @param {string} data - Text to write.
 * @returns {Promise<boolean>} Resolves with `true` once the file is written.
 *   Rejects with the underlying fs error (previously rejected with the bare
 *   value `false`, which discarded the error code and stack trace).
 */
const write_file = (file, data) => new Promise((resolve, reject) => {
  fs.writeFile(file, data, 'utf8', (error) => {
    if (error) {
      // Pass the real Error on so callers can inspect `code` / `message`.
      reject(error);
      return;
    }
    resolve(true);
  });
});
module.exports = Crawling;
BrandList.js
// Per-brand crawl configuration, keyed by brand name.
// Each entry holds the listing URL plus the selectors used to locate the
// product container, the individual products, and each product's fields.
const BrandList = {
  Imvely: {
    brand: 'Imvely',
    brandUrl: 'http://imvely.com/product/list.html?cate_no=72',
    productList: 'ul.prdList',
    product: 'ul.prdList > li.item',
    title: 'p.name span',
    // `:eq(1)` is jQuery-only selector syntax, not standard CSS.
    price: '.xans-product-listitem > li:eq(1) > span',
    image: '.prdImg > a > img',
    productUrl: '.prdImg > a',
  },
  Naning: {
    brand: 'Naning',
    brandUrl: 'https://www.naning9.com/shop/listplace.php',
    productList: 'div.item_list',
    product: 'div.item_list > div.list_cell',
    title: '.item_text > .item_name',
    price: '.item_price > p.inblock.dis',
    image: 'a > img',
    productUrl: 'a',
  },
};
module.exports = BrandList;
我认为这段代码还不够好。我不确定具体问题出在哪里,但我想改进我的抓取代码,让它更易用、更易读。
例如,我想知道当品牌数量增多时,这种写法是否会出现问题。
那么,我该如何改善?