I'm new to web scraping and want to use puppeteer to download all the images on a web page:
const puppeteer = require('puppeteer');

let scrape = async () => {
  // Actual Scraping goes Here...
  const browser = await puppeteer.launch({headless: false});
  const page = await browser.newPage();
  await page.goto('https://memeculture69.tumblr.com/');
  // Right click and save images
};

scrape().then((value) => {
  console.log(value); // Success!
});
I have looked at the API docs but can't figure out how to achieve this, so I'd appreciate your help.
Answer 0 (score: 4)
I think the logic is simple. You just need to make a function that takes the url of an image and saves it to a directory. Puppeteer only scrapes the image urls and passes them to the downloader function. Here is an example:
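A minimal sketch of that approach (the downloadImage helper name, the https-only assumption, and the .jpg file names are illustrative, not part of the original answer):

const fs = require('fs');
const https = require('https');
const puppeteer = require('puppeteer');

// Hypothetical downloader: fetches `url` and streams it to `destination` (https URLs only).
const downloadImage = (url, destination) => new Promise((resolve, reject) => {
  https.get(url, response => {
    response.pipe(fs.createWriteStream(destination))
      .on('finish', resolve)
      .on('error', reject);
  }).on('error', reject);
});

(async () => {
  const browser = await puppeteer.launch({headless: false});
  const page = await browser.newPage();
  await page.goto('https://memeculture69.tumblr.com/');

  // Puppeteer only scrapes the image urls; each url is handed to the downloader.
  const urls = await page.$$eval('img', imgs => imgs.map(img => img.src));
  for (let i = 0; i < urls.length; i++) {
    await downloadImage(urls[i], `image-${i}.jpg`).catch(console.error);
  }

  await browser.close();
})();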
Answer 1 (score: 4)
You can scrape an array of the src attributes of all the images on the page with the following:
const images = await page.evaluate( () => Array.from( document.images, e => e.src ) );
Then, you can use the Node File System Module and HTTP or HTTPS Module to download each image.
Complete example:
'use strict';
const fs = require( 'fs' );
const https = require( 'https' );
const puppeteer = require( 'puppeteer' );
/* ============================================================
Promise-Based Download Function
============================================================ */
const download = ( url, destination ) => new Promise( ( resolve, reject ) =>
{
    const file = fs.createWriteStream( destination );

    https.get( url, response =>
    {
        response.pipe( file );

        file.on( 'finish', () =>
        {
            file.close( () => resolve( true ) );
        });
    })
    .on( 'error', error =>
    {
        // Remove the partially written file, then reject with the error message.
        fs.unlink( destination, () => reject( error.message ) );
    });
});
/* ============================================================
Download All Images
============================================================ */
( async () =>
{
    const browser = await puppeteer.launch();
    const page = await browser.newPage();

    await page.goto( 'https://www.example.com/' );

    const images = await page.evaluate( () => Array.from( document.images, e => e.src ) );

    for ( let i = 0; i < images.length; i++ )
    {
        try
        {
            // Note: this names every file .png regardless of its actual type.
            await download( images[i], `image-${i}.png` );
            console.log( 'Success:', images[i], 'has been downloaded successfully.' );
        }
        catch ( error )
        {
            console.log( 'Error:', images[i], 'was not downloaded.' );
            console.error( error );
        }
    }

    await browser.close();
})();
Answer 2 (score: 3)
Here is another example. It runs an ordinary Google search and then downloads the Google image in the top left.
const puppeteer = require('puppeteer');
const fs = require('fs');

async function run() {
  const browser = await puppeteer.launch({
    headless: false
  });

  const page = await browser.newPage();
  await page.setViewport({ width: 1200, height: 1200 });
  await page.goto('https://www.google.com/search?q=.net+core&rlz=1C1GGRV_enUS785US785&oq=.net+core&aqs=chrome..69i57j69i60l3j69i65j69i60.999j0j7&sourceid=chrome&ie=UTF-8');

  // Selector for the Google logo image on the results page (brittle: Google's markup changes often).
  const IMAGE_SELECTOR = '#tsf > div:nth-child(2) > div > div.logo > a > img';
  let imageHref = await page.evaluate((sel) => {
    // Strip the leading slash so the path can be appended to the origin below.
    return document.querySelector(sel).getAttribute('src').replace('/', '');
  }, IMAGE_SELECTOR);

  console.log("https://www.google.com/" + imageHref);

  // Navigating directly to the image URL makes its bytes available via response.buffer().
  var viewSource = await page.goto("https://www.google.com/" + imageHref);
  fs.writeFile(".googles-20th-birthday-us-5142672481189888-s.png", await viewSource.buffer(), function (err) {
    if (err) {
      return console.log(err);
    }
    console.log("The file was saved!");
  });

  await browser.close();
}

run();
If you have a list of images you want to download, you could change the selector programmatically as needed and walk down the list of images, downloading them one at a time, as sketched below.
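A sketch of that idea, assuming every selector in the (hypothetical) selectors list matches an img element; the srcs are collected up front because each download navigates away from the page:

const puppeteer = require('puppeteer');
const fs = require('fs');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  await page.goto('https://www.example.com/'); // your page here

  // Hypothetical list of selectors, one per image to download.
  const selectors = ['#logo img', '.gallery img'];

  // Collect every src first, because page.goto() below leaves the original page.
  const hrefs = await page.evaluate(
    sels => sels.map(sel => document.querySelector(sel).src),
    selectors
  );

  for (let i = 0; i < hrefs.length; i++) {
    const response = await page.goto(hrefs[i]);
    fs.writeFileSync(`image-${i}.png`, await response.buffer());
  }

  await browser.close();
})();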
Answer 3 (score: 3)
If you want to skip the manual DOM traversal, you can write the images to disk directly from the page response.
Example:
const puppeteer = require('puppeteer');
const fs = require('fs');
const path = require('path');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  page.on('response', async response => {
    const url = response.url();
    if (response.request().resourceType() === 'image') {
      response.buffer().then(file => {
        // Name each file after the last path segment of its URL.
        const fileName = url.split('/').pop();
        const filePath = path.resolve(__dirname, fileName);
        const writeStream = fs.createWriteStream(filePath);
        writeStream.write(file);
      });
    }
  });

  await page.goto('https://memeculture69.tumblr.com/');
  await browser.close();
})();
Answer 4 (score: 2)
It is possible to get all the images without visiting each URL independently. You need to listen to all requests to the server:
await page.setRequestInterception(true)

// page.on() registers a listener synchronously; it does not return a promise, so no await.
page.on('request', function (request) {
  request.continue()
})

page.on('response', async function (response) {
  // Filter those responses that are interesting
  const data = await response.buffer()
  // data contains the img information
})
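A fuller sketch of this approach, under a few assumptions: the example.com URL, the fixed 5-second grace period for in-flight responses, and deriving the file extension from the content-type header are all illustrative choices, not part of the original answer.

const fs = require('fs');
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  await page.setRequestInterception(true);
  page.on('request', request => request.continue());

  let counter = 0;
  page.on('response', async response => {
    // Keep only image responses and name files from the content-type header.
    if (response.request().resourceType() !== 'image') return;
    try {
      const buffer = await response.buffer();
      const extension = (response.headers()['content-type'] || 'image/png').split('/').pop();
      fs.writeFileSync(`image-${counter++}.${extension}`, buffer);
    } catch (error) {
      console.error(error); // buffer() can fail for redirects or aborted responses
    }
  });

  await page.goto('https://www.example.com/');
  // Crude grace period so in-flight image responses can finish before closing.
  await new Promise(resolve => setTimeout(resolve, 5000));
  await browser.close();
})();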
Answer 5 (score: 0)
// Assumes fs has been required, `counter` starts at 0, and an images/ directory already exists.
page.on('response', async (response) => {
  const matches = /.*\.(jpg|png|svg|gif)$/.exec(response.url());
  if (matches && (matches.length === 2)) {
    const extension = matches[1];
    const buffer = await response.buffer();
    // Increment counter so each image gets a unique file name.
    fs.writeFileSync(`images/image-${counter++}.${extension}`, buffer);
  }
});
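For this handler to run it needs some surrounding setup; a minimal wrapper (the example.com URL is a placeholder) might look like:

const fs = require('fs');
const puppeteer = require('puppeteer');

(async () => {
  // Create the output directory and the counter the handler relies on.
  if (!fs.existsSync('images')) fs.mkdirSync('images');
  let counter = 0;

  const browser = await puppeteer.launch();
  const page = await browser.newPage();

  page.on('response', async (response) => {
    const matches = /.*\.(jpg|png|svg|gif)$/.exec(response.url());
    if (matches && (matches.length === 2)) {
      const buffer = await response.buffer();
      fs.writeFileSync(`images/image-${counter++}.${matches[1]}`, buffer);
    }
  });

  await page.goto('https://www.example.com/');
  await browser.close();
})();

Note that the regex anchors on the end of the URL, so image URLs with query strings (for example image.png?v=2) will be skipped.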
Answer 6 (score: 0)
To download an image by its selector, I did the following: grabbed the image's uri with the selector and passed the uri to the download function.
const puppeteer = require('puppeteer');
const fs = require('fs');
var request = require('request'); // note: the request package is deprecated

// Download function: issues a HEAD request to log metadata, then streams the image to disk.
var download = function (uri, filename, callback) {
  request.head(uri, function (err, res, body) {
    console.log('content-type:', res.headers['content-type']);
    console.log('content-length:', res.headers['content-length']);

    request(uri).pipe(fs.createWriteStream(filename)).on('close', callback);
  });
};

(async () => {
  const browser = await puppeteer.launch({
    headless: true,
    args: ['--no-sandbox', '--disable-setuid-sandbox'], // for no sandbox
  });
  const page = await browser.newPage();
  await page.goto('http://example.com'); // your url here

  let imageLink = await page.evaluate(() => {
    const image = document.querySelector('#imageId');
    return image.src;
  })

  // download() is callback-based, so this await does not actually wait for the file to finish.
  await download(imageLink, 'myImage.png', function () {
    console.log('done');
  });

  ...
})();
Answer 7 (score: -1)
Go through the blog linked below. It covers scraping websites with headless browsers using nodeJS and asynchronous programming, explains headless browsers in more detail before you start scraping, and addresses common myths about the legality of web scraping. Web Scraping with a Headless Browser: A Puppeteer Tutorial https://blog.datahut.co/web-scraping-headless-browser-puppeteer/