[[标签:javascript,cheerio,node.js,request-promise,Visual Studio Code]]
我正在用JS进行网页抓取,并且遇到以下问题...
我编写了以下代码,该代码从页面http://lotrtcgwiki.com/wiki/grand抓取所有表数据,并生成以下形式的json:
[
{
"cardID": "0P1",
"cardTitle": "The Prancing Pony (P)",
"cardType": "Site",
"cardSite": "Site",
"cardUrl": "http://lotrtcgwiki.com/wiki/lotr00001"
},
{
"cardID": "0P2",
"cardTitle": "•Bill the Pony (P)",
"cardType": "Possession",
"cardSite": "Shire",
"cardUrl": "http://lotrtcgwiki.com/wiki/lotr00002"
},
...
现在,在抓取每一行的同时,我想访问 cardUrl 链接,并从该页面抓取更多数据。这正是问题的症结所在:在抓取父页面的同时,如何抓取子页面?
我是 JS 和 promises 的新手,目前正在使用 request-promise,但最终可能会切换到 axios。
这是我的代码,欢迎直接运行:
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require("fs");
// Root address of the wiki site being scraped.
const siteUrl = "http://lotrtcgwiki.com";
// Base address that card page paths are appended to.
const wikiUrl = `${siteUrl}/wiki`;
// Address of the page that is handed to scrapeMainPage when the script starts.
const grandListUrl = `${wikiUrl}/grand`;
// Shared accumulator: scrapeMainPage pushes one record per table row into it.
const cards = [];
/**
 * Scrapes the card table found at `url`, requests every card's sub page, and
 * then exports the collected cards to "lotrtcgcards.json".
 *
 * Each table row yields one record `{ cardID, cardTitle, cardType, cardSite,
 * cardUrl }` that is appended to the module-level `cards` array.
 *
 * @param {string} url - Address of the page containing the card table.
 * @returns {Promise<void>} Settles after every row has been processed and the
 *     export has been started. Rejects if the main page request fails.
 */
const scrapeMainPage = async (url) => {
    const result = await axios.get(url);
    const $ = cheerio.load(result.data);

    // The first matched row is skipped (presumably the table header row).
    const rows = $("body > div.dokuwiki > div.page > div > div.wrap_indextab.plugin_wrap > div > table > tbody > tr")
        .toArray()
        .slice(1);

    // A plain for...of loop is used instead of `.each(async ...)`: `.each`
    // does not wait for the promises returned by an async callback, so the
    // export below could otherwise run while sub-page requests were still
    // pending. Rows are handled one at a time, which also avoids firing one
    // HTTP request per row all at once.
    for (const element of rows) {
        const tds = $(element).find("td");
        const cardID = $(tds[0]).text().trim();
        const cardTitle = $(tds[1]).text().trim();
        const href = $(tds[1]).children("a").attr("href");

        // Without a link there is no card page to build a URL for; calling
        // `.replace` on a missing href would throw.
        if (typeof href !== "string") {
            console.error(" Skipping row without a card link: " + cardID);
            continue;
        }

        const cardUrl = wikiUrl + href.replace("/wiki", "").trim();
        const cardType = $(tds[2]).text().trim();
        const cardSite = $(tds[3]).text().trim();

        // Scrape the sub page. A failure for one card is reported but must
        // not abort the whole run or lose the data already read from the row.
        try {
            await scrapeSubPage(cardUrl);
        } catch (error) {
            console.error(" Failed to scrape " + cardUrl, error);
        }

        cards.push({ cardID, cardTitle, cardType, cardSite, cardUrl });
    }

    exportResults(cards, "lotrtcgcards.json");
};
/**
 * Requests a single card page and logs progress before and after the request.
 * The response is currently not inspected and nothing is returned.
 *
 * @param {string} cardUrl - Address of the card page to request.
 * @returns {Promise<void>} Settles once the request has completed; rejects if
 *     the request fails.
 */
const scrapeSubPage = async (cardUrl) => {
    console.log(` Scraping ${cardUrl}`);
    await axios.get(cardUrl);
    console.log("done");
};
/**
 * Serializes `results` as pretty-printed JSON (4-space indent) and writes it
 * to `outputFile`.
 *
 * A write failure is logged rather than thrown, and the success message is
 * printed only when the write actually succeeded.
 *
 * @param {Array} results - Records to export; `results.length` is reported in
 *     the success message.
 * @param {string} outputFile - Path of the file to write.
 * @returns {Promise<boolean>} Resolves to `true` once the file has been
 *     written, or to `false` if the write failed. Never rejects on I/O errors.
 * @throws {TypeError} Synchronously, if `JSON.stringify` cannot serialize
 *     `results` (for example a circular structure).
 */
const exportResults = (results, outputFile) => {
    // Serialize before any I/O starts so a serialization error still surfaces
    // synchronously to the caller.
    const json = JSON.stringify(results, null, 4);

    return fs.promises.writeFile(outputFile, json).then(
        () => {
            console.log('\n' + results.length + ' Results exported successfully to '+ outputFile);
            return true;
        },
        (err) => {
            // Report the failure only; claiming success here would be wrong.
            console.log(err);
            return false;
        },
    );
};
// Entry point: start the scrape. The returned promise gets an explicit
// rejection handler so a failure is reported and reflected in the exit code
// instead of surfacing as an unhandled rejection.
scrapeMainPage(grandListUrl).catch((error) => {
    console.error(error);
    process.exitCode = 1;
});