我正在尝试编写并运行我的第一个 Puppeteer 脚本。到目前为止,我已经可以抓取第一页,但点击(翻页)功能不起作用。
目标是能够抓取所有页面,直到“下一页”按钮不再可见为止。
有关如何操作的任何建议?
const puppeteer = require('puppeteer');
/**
 * Entry point: opens the listing page, scrapes every result page by following
 * the `a.next_page` link until that link is no longer present, and logs the
 * collected records.
 *
 * The browser is always closed, even when scraping fails, and a failure is
 * reported on stderr with a non-zero exit code instead of an unhandled
 * promise rejection.
 */
(async () => {
  // Pass { headless: false } to launch() to watch the browser while debugging.
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    // NOTE(review): "pages" is the placeholder URL from the original script;
    // replace it with the real address of the first result page.
    await page.goto("pages", { waitUntil: 'domcontentloaded' });

    const listings = [];
    for (;;) {
      // The callback below is serialized and executed inside the page, so it
      // cannot reference anything from the Node scope.
      const pageListings = await page.evaluate(() => {
        // Text of the first descendant matching `selector`, or null if absent.
        const textOf = (root, selector) => {
          const node = root.querySelector(selector);
          return node ? node.innerText : null;
        };
        // Keeps only the digits of `text`; null stays null.
        const digitsOf = (text) => (text === null ? null : text.replace(/\D/g, ''));

        return Array.from(document.querySelectorAll('.sold-property-listing'), (el) => {
          const link = el.querySelector('a');
          const url = link ? link.href : '';
          // The id is whatever follows the last '-' in the listing link.
          const id = url ? url.split('-').pop() : null;

          const soldText = textOf(el, '.sold-property-listing__sold-date.sold-property-listing--left');
          const areaText = textOf(el, 'div.sold-property-listing__location > div > span.item-link');
          const typeText = textOf(el, 'div.sold-property-listing__location > div > span.hide-element');
          // Looked up on `el`, not `document`, so each listing reports its own price change.
          const bidText = textOf(el, 'div.sold-property-listing__price-change');
          // Text before 'm²' is the living area, text after it holds the room count.
          const sizeParts = (textOf(el, 'div.sold-property-listing__size > div > div.sold-property-listing__subheading.sold-property-listing--left') || '').split('m²');

          return {
            'id': id,
            'address': textOf(el, 'div.sold-property-listing__location > h2 > span.item-result-meta-attribute-is-bold.item-link'),
            'broker': textOf(el, '.sold-property-listing__broker'),
            'price': digitsOf(textOf(el, 'div.sold-property-listing__price > div:nth-child(1) > span')),
            'price_kvm': digitsOf(textOf(el, 'div.sold-property-listing__price > div:nth-child(2) > div.sold-property-listing__price-per-m2.sold-property-listing--left')),
            'sold_date': soldText === null ? null : soldText.replace('Såld ', ''),
            'kvm': sizeParts[0] ? sizeParts[0].replace(/\s/g, '') : null,
            'rooms': sizeParts[1] ? sizeParts[1].replace(/\D/g, '') : null,
            // 'Unkown' (sic) is kept so existing consumers of the data still match it.
            'area': areaText === null ? 'Unkown' : areaText.replace(',', ''),
            'type': typeText === null ? 'Unkown' : typeText,
            'monthly_fee': digitsOf(textOf(el, 'div.sold-property-listing__size > div > div.sold-property-listing__fee')),
            'bid_change': bidText === null ? '0' : bidText.replace(' %', ''),
          };
        });
      });
      listings.push(...pageListings);

      // No "next" link means this was the last page.
      const nextLink = await page.$('a.next_page');
      if (!nextLink) {
        break;
      }

      // Start waiting for the navigation before clicking, otherwise a fast
      // navigation could finish before the wait is registered.
      await Promise.all([
        page.waitForNavigation({ waitUntil: 'domcontentloaded' }),
        page.click('a.next_page'),
      ]);
    }

    console.log(listings);
  } finally {
    // Close the browser when we are done, whether scraping succeeded or not.
    await browser.close();
  }
})().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});
答案 0 :(得分:0)
我想出了一个运行良好的解决方案:
const puppeteer = require('puppeteer');
// Debug trace printed when this module is evaluated.
console.log('into web_scraper')
/**
 * Scrapes sold-property listings from consecutive result pages.
 *
 * Starting at `startUrl`, the listings on the current page are extracted and
 * the `a.next_page` link is followed until `pageAmount` pages have been read
 * or no such link exists, whichever comes first.
 *
 * @param {number} [pageAmount] - Number of result pages to scrape; any falsy
 *   value (undefined, 0, null) is treated as 1.
 * @param {string} [startUrl] - Address of the first result page.
 * @returns {Promise<Object[]>} One record per listing with the keys `url`
 *   (address of the result page the listing was found on), `id`, `address`,
 *   `broker`, `price`, `price_kvm`, `sold_date`, `kvm`, `rooms`, `area`,
 *   `type`, `monthly_fee` and `bid_change`. Fields whose element is missing
 *   are null, except `area`/`type` ('Unkown') and `bid_change` ('0').
 * @throws Rejects with the underlying Puppeteer error if launching,
 *   navigating or evaluating fails; the browser is closed in that case too.
 */
const webscraping = async (pageAmount, startUrl = "https://www.hemnet.se/salda/bostader?location_ids%5B%5D=898741") => {
  console.log('into webscraping')
  const totalPages = pageAmount || 1;

  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto(startUrl);

    const dataObj = [];
    for (let currentPage = 1; currentPage <= totalPages; currentPage++) {
      // The callback below is serialized and executed inside the page, so it
      // cannot reference anything from the Node scope.
      const results = await page.evaluate(() => {
        // Text of the first descendant matching `selector`, or null if absent.
        const textOf = (root, selector) => {
          const node = root.querySelector(selector);
          return node ? node.innerText : null;
        };
        // Keeps only the digits of `text`; null stays null.
        const digitsOf = (text) => (text === null ? null : text.replace(/\D/g, ''));

        const pageUrl = document.location.href;

        return Array.from(document.querySelectorAll('.sold-property-listing'), (el) => {
          const link = el.querySelector('a');
          const listingUrl = link ? link.href : '';
          // The id is whatever follows the last '-' in the listing's own
          // link (not in the result page address, which has no such suffix).
          const id = listingUrl ? listingUrl.split('-').pop() : null;

          const soldText = textOf(el, '.sold-property-listing__sold-date.sold-property-listing--left');
          const areaText = textOf(el, 'div.sold-property-listing__location > div > span.item-link');
          const typeText = textOf(el, 'div.sold-property-listing__location > div > span.hide-element');
          // Looked up on `el`, not `document`, so each listing reports its own price change.
          const bidText = textOf(el, 'div.sold-property-listing__price-change');
          // Text before 'm²' is the living area, text after it holds the room count.
          const sizeParts = (textOf(el, 'div.sold-property-listing__size > div > div.sold-property-listing__subheading.sold-property-listing--left') || '').split('m²');

          return {
            'url': pageUrl,
            'id': id,
            'address': textOf(el, 'div.sold-property-listing__location > h2 > span.item-result-meta-attribute-is-bold.item-link'),
            'broker': textOf(el, '.sold-property-listing__broker'),
            'price': digitsOf(textOf(el, 'div.sold-property-listing__price > div:nth-child(1) > span')),
            'price_kvm': digitsOf(textOf(el, 'div.sold-property-listing__price > div:nth-child(2) > div.sold-property-listing__price-per-m2.sold-property-listing--left')),
            'sold_date': soldText === null ? null : soldText.replace('Såld ', ''),
            'kvm': sizeParts[0] ? sizeParts[0].replace(/\s/g, '') : null,
            'rooms': sizeParts[1] ? sizeParts[1].replace(/\D/g, '') : null,
            // 'Unkown' (sic) is kept so existing consumers of the data still match it.
            'area': areaText === null ? 'Unkown' : areaText.replace(',', ''),
            'type': typeText === null ? 'Unkown' : typeText,
            'monthly_fee': digitsOf(textOf(el, 'div.sold-property-listing__size > div > div.sold-property-listing__fee')),
            'bid_change': bidText === null ? '0' : bidText.replace(' %', ''),
          };
        });
      });
      dataObj.push(...results);

      if (currentPage === totalPages) {
        break;
      }

      // Fewer pages exist than were requested: return what has been collected
      // instead of timing out while waiting for a link that never appears.
      const nextLink = await page.$('a.next_page');
      if (!nextLink) {
        break;
      }

      // Start waiting for the navigation before clicking; waiting only for
      // '.sold-property-listing' would succeed immediately against the page
      // that is still displayed and the same page could be scraped twice.
      await Promise.all([
        page.waitForNavigation(),
        page.click('a.next_page'),
      ]);
      await page.waitForSelector('.sold-property-listing');
    }

    return dataObj;
  } finally {
    // Always release the browser process, including on failure.
    await browser.close();
  }
}
// Expose the scraper function as this module's export (CommonJS).
module.exports = webscraping;