Scraping sub pages while scraping the main page table [JS, NodeJS, Cheerio, request-promise]

Time: 2020-04-11 20:41:54

Tags: javascript node.js web-scraping cheerio request-promise

[[Using javascript, cheerio, node.js, request-promise, Visual Studio Code]]

I am doing some web scraping in JS and have run into the following problem...

I wrote the code below, which scrapes all of the table data from the page http://lotrtcgwiki.com/wiki/grand and produces JSON of the following form:

[
    {
        "cardID": "0P1",
        "cardTitle": "The Prancing Pony (P)",
        "cardType": "Site",
        "cardSite": "Site",
        "cardUrl": "http://lotrtcgwiki.com/wiki/lotr00001"
    },
    {
        "cardID": "0P2",
        "cardTitle": "•Bill the Pony (P)",
        "cardType": "Possession",
        "cardSite": "Shire",
        "cardUrl": "http://lotrtcgwiki.com/wiki/lotr00002"
    },
    ...

Now, while scraping each row, I also want to follow the cardUrl link and scrape additional data from that page. That is the crux of the problem: how do I scrape a sub page while I am scraping the parent page?

I am new to JS promises and am using request-promise, but I may end up switching to axios.
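For what it's worth, a minimal request-promise fetch looks roughly like the sketch below: request-promise resolves with the response body by default, which can then be handed to cheerio. This is only an illustration of the library call, not code from my project.

const rp = require('request-promise');
const cheerio = require('cheerio');

const fetchPage = async (url) => {
    // request-promise resolves with the response body (an HTML string)
    const html = await rp(url);
    // hand the HTML to cheerio for querying
    return cheerio.load(html);
};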

Here is my code; feel free to run it:

const axios = require('axios');
const cheerio = require('cheerio');
const fs = require("fs");

const siteUrl = "http://lotrtcgwiki.com";
const wikiUrl = siteUrl + "/wiki";
const grandListUrl = wikiUrl + "/grand";
const cards = []

const scrapeMainPage = async (url) => {

    const result = await axios.get(url);
    const $ = cheerio.load(result.data);

    $("body > div.dokuwiki > div.page > div > div.wrap_indextab.plugin_wrap > div > table > tbody > tr")
        .each(async (index, element) => {

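            // skip the table header row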
            if (index === 0) return true;

            const tds = $(element).find("td");
            const cardID = $(tds[0]).text().trim();
            const cardTitle = $(tds[1]).text().trim();
            const cardUrl = wikiUrl + $(tds[1]).children("a").attr("href").replace("/wiki", "").trim();
            const cardType = $(tds[2]).text().trim();
            const cardSite = $(tds[3]).text().trim();

            // Scrape the sub page
            scrapeSubPage(cardUrl)

            const card = { cardID, cardTitle, cardType, cardSite, cardUrl };
            cards.push(card);   

        });

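    // export whatever is currently in the cards array to a JSON file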
    exportResults(cards, "lotrtcgcards.json")
}

const scrapeSubPage = async (cardUrl) => {
    console.log("  Scraping " + cardUrl);
    const subPage = await axios.get(cardUrl);
    console.log("done")
}

const exportResults = (results, outputFile) => {

    try {
        fs.writeFile(outputFile, JSON.stringify(results, null, 4), (err) => {
            if (err) {
                console.log(err);
            }
            console.log('\n' + results.length + ' Results exported successfully to '+ outputFile);
        })
    } catch (error) {
        throw error;
    }
}

scrapeMainPage(grandListUrl);
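
Roughly, what I am after is something like the sketch below, though I do not know whether this is the right way to sequence the requests. It reuses the requires, constants, and helper functions defined above, assumes a hypothetical version of scrapeSubPage that returns an object of extra fields instead of just logging, and shortens the long table selector for readability:

const scrapeEverything = async () => {
    const result = await axios.get(grandListUrl);
    const $ = cheerio.load(result.data);

    // collect the plain row data synchronously first
    const rows = [];
    $("div.wrap_indextab table tr").each((index, element) => {
        if (index === 0) return true;   // skip the header row
        const tds = $(element).find("td");
        rows.push({
            cardID: $(tds[0]).text().trim(),
            cardTitle: $(tds[1]).text().trim(),
            cardType: $(tds[2]).text().trim(),
            cardSite: $(tds[3]).text().trim(),
            cardUrl: wikiUrl + $(tds[1]).children("a").attr("href").replace("/wiki", "").trim()
        });
    });

    // then fetch each sub page one at a time and merge its extra fields
    for (const row of rows) {
        const extra = await scrapeSubPage(row.cardUrl);   // hypothetical: returns an object of extra fields
        cards.push({ ...row, ...extra });
    }

    exportResults(cards, "lotrtcgcards.json");
};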

0 Answers:

There are no answers yet.