我正在尝试下载文件,但下载后的文件无法使用。文件确实下载下来了,但大小只有 1 KB,并不是实际的文件大小。
如果使用 fetchResp.text(),下载得到的文件就无法打开。
这里是完整代码。
我认为问题可能在这里:return await fetchResp.text();
这是示例,设置cookie也很重要,因为我要在登录后下载数据。
如何在 Puppeteer 中同时处理 cookie 和 fetch?
如果我把 fetch 调用放到 page.evaluate 之外会怎样?{ credentials: "include" } 还会生效吗?
预先感谢您的帮助。
const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
const fs = require("fs");
// Question's reproducer: scrape the download links from the sample page, then
// fetch each file from inside the page context and write it to disk.
(async () => {
const browser = await puppeteer.launch({
args: ["--no-sandbox"],
headless: false,
slowMo: 30,
});
const page = await browser.newPage();
await page.goto(
"https://file-examples.com/index.php/sample-documents-download/sample-xls-download/"
);
// Parse a snapshot of the rendered HTML with cheerio instead of querying the live page.
const content = await page.content();
const $ = cheerio.load(content);
// One { Filename, URL } entry per table row that contains a link.
const listings = $("#table-files > tbody > tr:has(a)")
.map((index, element) => {
const URL = $(element).find("a").attr("href");
// File name = last path segment of the link's href.
const Filename = $(element).find("a").attr("href").split("/").pop();
//.replace(/^.*[\\\/]/g, "");
// NOTE(review): `name` is computed but never used or returned.
const name = $(element)
.find("td:nth-child(1)")
.text()
.trim()
.replace("\n", "");
return {
Filename,
URL,
};
})
.get();
for (let val of listings) {
const downloadUrl = val.URL;
const Filename = val.Filename;
console.log(val);
// The fetch runs inside the page (page.evaluate) with credentials: "include".
const downloadedContent = await page.evaluate(async (downloadUrl) => {
const fetchResp = await fetch(downloadUrl, { credentials: "include" });
// NOTE(review): the body is read as text; for a binary file such as .xls this
// presumably does not preserve the original bytes — this is the line the
// question suspects.
return await fetchResp.text();
}, downloadUrl);
// NOTE(review): fs.writeFile is callback-based and not awaited here, and the
// callback takes no error argument, so a failed write would still log "Wrote file".
fs.writeFile(`./${Filename}`, downloadedContent, () =>
console.log("Wrote file")
);
}
await page.close();
await browser.close();
})();
答案 0 :(得分:0)
这里的主要问题是,您把文件内容当作纯文本来获取。如果需要的是纯文本文件,这样做没有问题;但您要写入的是 Excel 文件,因此需要 blob 或 arrayBuffer,而这两者都无法从 page.evaluate 方法返回。
参见 https://github.com/puppeteer/puppeteer/issues/3722
因此,您无需使用 Puppeteer 的 page.evaluate 函数来获取 Excel 文件。
在获取所有链接之后,您可以直接在 Node 中使用 https 模块请求这些链接,
然后将响应内容流式写入文件;在这种情况下这样做更简单,代码也更少。您需要做以下修改:
首先需要使用https模块
const https = require('https');
获取链接后,请关闭 Puppeteer,因为我们不再需要它了
.get();
// Close the page and the browser here, before the download loop runs.
await page.close();
await browser.close();
在遍历链接时在此处调用函数
// Download every scraped link; getFile is awaited on each iteration, so the
// downloads run one after another whenever getFile returns a promise.
for (const val of listings) {
  const downloadUrl = val.URL;
  const Filename = val.Filename;
  console.log(val);
  // The result is not needed, so no binding is created for it.
  await getFile(downloadUrl, Filename);
}
最后,您需要在主代码块之外创建一个函数来读写文件
/**
 * Download `downloadUrl` over HTTPS and stream the response body into `Filename`.
 *
 * The response is piped straight to disk as raw bytes, so binary files
 * (e.g. .xls) are written unchanged.
 *
 * @param {string} downloadUrl - Absolute https:// URL of the file.
 * @param {string} Filename - Path of the file to create or overwrite.
 * @returns {Promise<string>} Resolves with `Filename` once all data has been
 *   flushed to disk; rejects on a request/stream error or a non-2xx status.
 */
function getFile(downloadUrl, Filename) {
  // Returning a promise is what makes `await getFile(...)` actually wait.
  return new Promise((resolve, reject) => {
    const req = https.get(downloadUrl, (res) => {
      const { statusCode } = res;
      if (statusCode < 200 || statusCode >= 300) {
        // Drain the body so the socket is released, and do not save an
        // error/redirect page under the target file name.
        res.resume();
        reject(
          new Error(`Download of ${downloadUrl} failed with HTTP status ${statusCode}`)
        );
        return;
      }

      // Create the file only after a successful response has arrived.
      const writeStream = fs.createWriteStream(Filename);
      res.on('error', (err) => {
        writeStream.destroy();
        reject(err);
      });
      writeStream.on('error', reject);
      // 'finish' fires after the last chunk has been written to the file.
      writeStream.on('finish', () => {
        console.log('No more data in response.');
        resolve(Filename);
      });
      res.pipe(writeStream);
    });
    // https.get() ends the request itself; only errors need handling here.
    req.on('error', reject);
  });
}
完整片段
const puppeteer = require('puppeteer');
const cheerio = require("cheerio");
const fs = require("fs");
const https = require('https');
// Scrape the download links with Puppeteer + cheerio, close the browser, then
// download every linked file through getFile().
(async () => {
  const browser = await puppeteer.launch({
    args: ["--no-sandbox"],
    headless: false,
    slowMo: 30,
  });

  let listings;
  try {
    const page = await browser.newPage();
    await page.goto(
      "https://file-examples.com/index.php/sample-documents-download/sample-xls-download/"
    );
    // Parse a snapshot of the rendered HTML with cheerio.
    const $ = cheerio.load(await page.content());
    // One { Filename, URL } entry per table row that contains a link.
    listings = $("#table-files > tbody > tr:has(a)")
      .map((index, element) => {
        // Named `href` locally so the global URL class is not shadowed.
        const href = $(element).find("a").attr("href");
        return {
          // File name = last path segment of the link.
          Filename: href.split("/").pop(),
          URL: href,
        };
      })
      .get();
  } finally {
    // Close the browser even if navigation or scraping throws; it is not
    // needed for the downloads below.
    await browser.close();
  }

  for (const val of listings) {
    console.log(val);
    // Call the function with each link and file name, one at a time.
    await getFile(val.URL, val.Filename);
  }
})().catch((err) => {
  // Do not leave the top-level promise floating: report and signal failure.
  console.error(err);
  process.exitCode = 1;
});
/**
 * Send an HTTPS GET request and stream the response body into a file.
 *
 * The response is piped straight to disk as raw bytes, so binary files
 * (e.g. .xls) are written unchanged.
 *
 * @param {string} downloadUrl - Absolute https:// URL of the file.
 * @param {string} Filename - Path of the file to create or overwrite.
 * @returns {Promise<string>} Resolves with `Filename` once all data has been
 *   flushed to disk; rejects on a request/stream error or a non-2xx status.
 */
function getFile(downloadUrl, Filename) {
  // Returning a promise is what makes `await getFile(...)` actually wait.
  return new Promise((resolve, reject) => {
    const req = https.get(downloadUrl, (res) => {
      const { statusCode } = res;
      if (statusCode < 200 || statusCode >= 300) {
        // Drain the body so the socket is released, and do not save an
        // error/redirect page under the target file name.
        res.resume();
        reject(
          new Error(`Download of ${downloadUrl} failed with HTTP status ${statusCode}`)
        );
        return;
      }

      // Create the file only after a successful response has arrived.
      const writeStream = fs.createWriteStream(Filename);
      res.on('error', (err) => {
        writeStream.destroy();
        reject(err);
      });
      writeStream.on('error', reject);
      // 'finish' fires after the last chunk has been written to the file.
      writeStream.on('finish', () => {
        console.log('No more data in response.');
        resolve(Filename);
      });
      res.pipe(writeStream);
    });
    // https.get() ends the request itself; only errors need handling here.
    req.on('error', reject);
  });
}
编辑:看到您的评论后补充——您可以通过修改 GET 请求来发送 cookie,但要记住 cookie 受同域策略的限制
/**
 * Download `downloadUrl` over HTTPS, sending a Cookie header, and stream the
 * response body into `Filename`.
 *
 * The full URL (including port and query string) is passed to the request,
 * and the response is piped to disk as raw bytes.
 *
 * @param {string} downloadUrl - Absolute https:// URL of the file.
 * @param {string} Filename - Path of the file to create or overwrite.
 * @param {string} [cookieHeader='myCookie=myvalue'] - Value of the Cookie
 *   request header; the default is the placeholder from the original example.
 * @returns {Promise<string>} Resolves with `Filename` once all data has been
 *   flushed to disk; rejects on a request/stream error or a non-2xx status.
 */
function getFile(downloadUrl, Filename, cookieHeader = 'myCookie=myvalue') {
  const options = {
    headers: {
      Cookie: cookieHeader,
    },
  };
  // Returning a promise is what makes `await getFile(...)` actually wait.
  return new Promise((resolve, reject) => {
    // Passing the URL itself (rather than hostname + pathname) keeps the
    // port and the query string of the link.
    const req = https.get(downloadUrl, options, (res) => {
      const { statusCode } = res;
      if (statusCode < 200 || statusCode >= 300) {
        // Drain the body so the socket is released, and do not save an
        // error/redirect page under the target file name.
        res.resume();
        reject(
          new Error(`Download of ${downloadUrl} failed with HTTP status ${statusCode}`)
        );
        return;
      }

      // Create the file only after a successful response has arrived.
      const writeStream = fs.createWriteStream(Filename);
      res.on('error', (err) => {
        writeStream.destroy();
        reject(err);
      });
      writeStream.on('error', reject);
      // 'finish' fires after the last chunk has been written to the file.
      writeStream.on('finish', () => {
        console.log('No more data in response.');
        resolve(Filename);
      });
      res.pipe(writeStream);
    });
    // https.get() ends the request itself; only errors need handling here.
    req.on('error', reject);
  });
}