因此,我试图仅从两个以上的网站(这里为PS Store)中删除两个元素。另外,我正在尝试以最简单的方式实现它。由于我是JS的新手,因此请保持柔和;)在我的脚本下方。我试图通过for循环来实现它,但是没有任何效果(仍然只有阵列中的第一个网站)。非常感谢您提供的任何帮助。
const puppeteer = require("puppeteer");
async function scrapeProduct(url) {
const urls = [
"https://store.playstation.com/pl-pl/product/EP9000-CUSA09176_00-DAYSGONECOMPLETE",
"https://store.playstation.com/pl-pl/product/EP9000-CUSA13323_00-GHOSTSHIP0000000",
];
const browser = await puppeteer.launch();
const page = await browser.newPage();
for (i = 0; i < urls.length; i++) {
const url = urls[i];
const promise = page.waitForNavigation({ waitUntil: "networkidle" });
await page.goto(`${url}`);
await promise;
}
const [el] = await page.$x(
"/html/body/div[3]/div/div/div[2]/div/div/div[2]/h2"
);
const txt = await el.getProperty("textContent");
const title = await txt.jsonValue();
const [el2] = await page.$x(
"/html/body/div[3]/div/div/div[2]/div/div/div[1]/div[2]/div[1]/div[1]/h3"
);
const txt2 = await el2.getProperty("textContent");
const price = await txt2.jsonValue();
console.log({ title, price });
browser.close();
}
scrapeProduct();
答案 0 :(得分:0)
通常,您的代码还可以。不过,应该纠正的事情很少:
const puppeteer = require("puppeteer");
async function scrapeProduct(url) {
const urls = [
"https://store.playstation.com/pl-pl/product/EP9000-CUSA09176_00-DAYSGONECOMPLETE",
"https://store.playstation.com/pl-pl/product/EP9000-CUSA13323_00-GHOSTSHIP0000000",
];
const browser = await puppeteer.launch({
headless: false
});
for (i = 0; i < urls.length; i++) {
const page = await browser.newPage();
const url = urls[i];
const promise = page.waitForNavigation({
waitUntil: "networkidle2"
});
await page.goto(`${url}`);
await promise;
const [el] = await page.$x(
"/html/body/div[3]/div/div/div[2]/div/div/div[2]/h2"
);
const txt = await el.getProperty("textContent");
const title = await txt.jsonValue();
const [el2] = await page.$x(
"/html/body/div[3]/div/div/div[2]/div/div/div[1]/div[2]/div[1]/div[1]/h3"
);
const txt2 = await el2.getProperty("textContent");
const price = await txt2.jsonValue();
console.log({
title,
price
});
}
browser.close();
}
scrapeProduct();
{ headless: false }
。这使您可以查看浏览器中实际发生的情况。networkidle
最新版本中的事件npm
。您应该改用networkidle0
或networkidle2
。html/body/div...
寻找元素。这可能是主观的,但是我认为标准的JS / CSS选择器更具可读性:body > div ...
。但是,如果它可行... 上面的代码会产生以下结果:
{ title: 'Days Gone™', price: '289,00 zl' }
{ title: 'Ghost of Tsushima', price: '289,00 zl' }