我的代码使用数据抓取从该房地产网站(https://www.zillow.com/vancouver-bc/)中提取信息。现在,对于图像,我总是会在输入几次后得到一些垃圾数据。我尝试滚动到页面底部,然后截屏以查看是否所有延迟加载的图像都已实际加载。屏幕截图显示,所有图像在滚动后都已加载,但是我收到的数据仍然没有图像数据。我确定有一些代码错误,但是我找不到它。
let cheerio = require('cheerio')
let puppeteer = require('puppeteer-extra')
const pluginStealth = require("puppeteer-extra-plugin-stealth")
puppeteer.use(pluginStealth())
let userAgent = require('random-useragent')
const baseURL = "https://www.zillow.com/vancouver-bc"
let estateData = []
let urlLinks = []
let getEstateData = async () => {
estateData = []
urlLinks = []
let url
for (let pgNum = 1; pgNum <= 1; pgNum++) {
if (pgNum === 1) {
url = baseURL + "/"
} else {
url = baseURL + ("/" + pgNum + "_p")
}
urlLinks.push(url)
}
await searchWebsite()
console.log("search over")
return estateData
//module.exports = estateData
}
function scrollPage(page) {
return page.evaluate( () => {
// Page evaluate's scope is the page.
// You have to pass args as a second parameter to evalute, but functions come up undefined.
// https://stackoverflow.com/questions/46088351/puppeteer-pass-variable-in-evaluate
let interval;
let scrollTop = 0;
const scrollBottom = 20;
const intervalRate = 50;
const pageDocument = document.documentElement;
// this needs to hang until the interval clears
// or the script will just move on to the waitFor below.
return new Promise(resolve => {
function scroll() {
if (scrollTop + scrollBottom < pageDocument.scrollHeight) {
scrollTop += scrollBottom;
window.scroll(0, scrollTop);
} else {
clearInterval(interval);
resolve(window.data);
}
}
interval = setInterval(scroll, intervalRate);
});
});
}
let searchWebsite = async () => {
await puppeteer
.launch({headless : false})
.then(async function (browser) {
let page = await browser.newPage();
// await page.setRequestInterception(true)
//
// page.on('request', (req) => {
// if( req.resourceType() === 'image' || req.resourceType() === 'stylesheet' || req.resourceType() === 'font'){
// req.abort()
// }
// else {
// req.continue()
// }
//
// })
await page.setViewport({ width: 1001, height: 1001 });
let html
await page.setUserAgent(userAgent.getRandom())
for(let url of urlLinks){
console.log(url)
await page.goto(url).then(async function () {
html = await page.content();
let obj = await cheerio('.list-card-link.list-card-info', html)
let imgObj = await cheerio(".list-card-top", html)
let geoLocation = await cheerio(".photo-cards.photo-cards_wow", html)
await scrollPage(page)
.then(async () => {
await page.screenshot({path: 'testScreenShot.png', fullPage: true});
})
.then(async () => {
let num = 0
console.log(obj.length, "scrapping")
for (let key in obj) {
if (obj[key].attribs) {
try {
let geoStr = await geoLocation[0].children[0].children[0].children[0].data
let geoObj = await (JSON.parse(geoStr)["geo"])
let extractedInfo = {
estateName: await obj[key].children[0].children[0].data,
estatePrice: await obj[key].children[2].children[0].children[0].data,
saleType: await obj[key].children[1].children[0].next.data,
estateConfig: {
beds: await obj[key].children[2].children[1].children[0].children[0].data,
bath: await obj[key].children[2].children[1].children[1].children[0].data,
area: await obj[key].children[2].children[1].children[2].children[0].data
},
estateLocation: {
longitude: await geoObj.longitude,
latitude: await geoObj.latitude
},
estateLink: await obj[key].attribs.href,
estateCoverImgLink: await imgObj[num].children[2].children[0].attribs.src
}
console.log(extractedInfo.estateName, extractedInfo.estateCoverImgLink)
//console.log(geoLocation[0].children[0].children[0])
await estateData.push(extractedInfo)
num++
} catch (e) {
console.log("Estate Skipped - ", obj[key].children[0].children[0].data, obj[key].attribs.href)
}
}
}
})
})
console.log(estateData.length)
}
//Now read the page
console.log("total - ", estateData.length)
await page.close()
await browser.close()
})
.catch(function (err) {
console.log(err)
});
}
module.exports.getEstateData = getEstateData