使用 fetch 下载 Excel 文件不起作用,下载到的文件只有 1 KB

时间:2020-04-08 14:23:24

标签: javascript node.js fetch

我正在尝试下载文件,下载后无法使用。我正在获取文件,但大小为1kb,这不是实际的文件大小。

如果使用 fetchResp.text(),下载得到的文件无法打开。

这里是完整代码。

我认为问题可能在这里:return await fetchResp.text();

这是示例,设置cookie也很重要,因为我要在登录后下载数据。

如何在 Puppeteer 中同时处理 cookie 和 fetch?

如果我把 fetch 调用放在 page.evaluate 之外会怎样?{ credentials: "include" } 还有效吗?

预先感谢您的帮助。

const puppeteer = require("puppeteer");
const cheerio = require("cheerio");
const fs = require("fs");

// Question's script: open the sample-XLS listing page with Puppeteer, collect
// every row's download link with cheerio, then fetch each file from inside the
// page and write it into the current directory.
(async () => {
  // headless: false opens a visible browser window; slowMo delays each
  // Puppeteer operation by 30 ms.
  const browser = await puppeteer.launch({
    args: ["--no-sandbox"],
    headless: false,
    slowMo: 30,
  });
  const page = await browser.newPage();

  await page.goto(
    "https://file-examples.com/index.php/sample-documents-download/sample-xls-download/"
  );
  // Hand the page's HTML to cheerio so the table can be queried with selectors.
  const content = await page.content();
  const $ = cheerio.load(content);

  // One { Filename, URL } entry per table row that contains a link.
  const listings = $("#table-files > tbody > tr:has(a)")
    .map((index, element) => {
      const URL = $(element).find("a").attr("href");

      // The file name is the last "/"-separated segment of the link.
      const Filename = $(element).find("a").attr("href").split("/").pop();
      //.replace(/^.*[\\\/]/g, "");

      // NOTE(review): `name` is computed here but never used afterwards.
      const name = $(element)
        .find("td:nth-child(1)")
        .text()
        .trim()
        .replace("\n", "");

      return {
        Filename,
        URL,
      };
    })
    .get();

  for (let val of listings) {
    const downloadUrl = val.URL;
    const Filename = val.Filename;
    console.log(val);

    // The callback runs inside the page; its return value is handed back to
    // Node and stored in downloadedContent.
    const downloadedContent = await page.evaluate(async (downloadUrl) => {
      // credentials: "include" asks fetch to send cookies with the request.
      const fetchResp = await fetch(downloadUrl, { credentials: "include" });
      // NOTE(review): text() reads the body as a string, so the bytes of a
      // binary .xls file are not preserved. This is the line the question
      // suspects.
      return await fetchResp.text();
    }, downloadUrl);

    // fs.writeFile is not awaited, so the loop (and the close calls below) do
    // not wait for the write. The callback ignores its error argument, so
    // "Wrote file" is logged even when the write fails.
    fs.writeFile(`./${Filename}`, downloadedContent, () =>
      console.log("Wrote file")
    );
  }

  await page.close();
  await browser.close();
})();

1 个答案:

答案 0 :(得分:0)

这里的主要问题是您将文件内容作为纯文本获取。如果您需要的是纯文本文件,这样做没有问题;但是您要写入的是 Excel 文件,因此需要 Blob 或 ArrayBuffer,而这两者都无法从 page.evaluate 方法返回。参见 https://github.com/puppeteer/puppeteer/issues/3722

因此,您无需使用 Puppeteer 的 page.evaluate 函数来获取 Excel 文件。在获取所有链接之后,您可以直接在 Node 中使用 https 模块下载它们,并将内容以流的方式写入文件;在这种情况下这样做更简单,代码也更少。您需要做以下修改:

首先需要使用https模块

const https = require('https');

获取链接后,请关闭 Puppeteer,因为我们不再需要它

.get();
await page.close();
await browser.close();

在遍历链接时在此处调用函数

// Request every collected link in turn. The `await` only makes the loop wait
// for a download when getFile returns a promise.
for (const val of listings) {
 const { URL: downloadUrl, Filename } = val;
 console.log(val);
 await getFile(downloadUrl, Filename);
}

最后,您需要在主代码块之外创建一个函数来读写文件

/**
 * Download `downloadUrl` over HTTPS and stream the response body to `Filename`.
 *
 * The body is piped as raw bytes, so binary files such as .xls are written
 * unchanged. The output file is only created once a 2xx response has arrived,
 * so a failed request does not leave an empty or error-page file behind.
 *
 * @param {string} downloadUrl - Absolute https:// URL of the file.
 * @param {string} Filename - Path of the file to write.
 * @returns {Promise<string>} Resolves with `Filename` once the data has been
 *     flushed to the file. Rejects on an invalid URL, a request or response
 *     error, a non-2xx status code, or a write error.
 */
function getFile(downloadUrl, Filename) {
    return new Promise((resolve, reject) => {
        // https.get() ends the request itself, so no req.end() is needed.
        // It throws synchronously on an invalid URL; inside the executor that
        // becomes a rejection.
        const req = https.get(downloadUrl, (res) => {
            if (res.statusCode < 200 || res.statusCode >= 300) {
                // Discard the body so the socket is released.
                res.resume();
                reject(new Error(`Request for ${downloadUrl} failed with status ${res.statusCode}`));
                return;
            }

            const writeStream = fs.createWriteStream(Filename);
            writeStream.on('error', (err) => {
                res.destroy();
                reject(err);
            });
            // 'finish' fires after the last chunk has been written.
            writeStream.on('finish', () => resolve(Filename));

            res.on('error', (err) => {
                writeStream.destroy();
                reject(err);
            });
            res.on('end', () => {
                console.log('No more data in response.');
            });
            res.pipe(writeStream);
        });
        req.on('error', reject);
    });
}

完整片段

const puppeteer = require('puppeteer');
const cheerio = require("cheerio");
const fs = require("fs");
const https = require('https');

// Scrape the sample-XLS listing page for download links, shut the browser
// down, then download every file with getFile.
(async () => {
	const browser = await puppeteer.launch({
		args: ["--no-sandbox"],
		headless: false,
		slowMo: 30,
	});

	let listings;
	try {
		const page = await browser.newPage();

		await page.goto(
			"https://file-examples.com/index.php/sample-documents-download/sample-xls-download/"
		);
		const content = await page.content();
		const $ = cheerio.load(content);

		// One { Filename, URL } entry per table row that contains a link.
		listings = $("#table-files > tbody > tr:has(a)")
			.map((index, element) => {
				const href = $(element).find("a").attr("href");
				// cheerio's map() leaves out null results, so an anchor
				// without an href is skipped instead of crashing on split().
				if (!href) {
					return null;
				}
				return {
					// The file name is the last "/"-separated segment of the link.
					Filename: href.split("/").pop(),
					URL: href,
				};
			})
			.get();
	} finally {
		// The browser is only needed for scraping. Close it even when the
		// navigation or parsing above throws; closing it also closes its pages.
		await browser.close();
	}

	for (const val of listings) {
		const { URL: downloadUrl, Filename } = val;
		console.log(val);
		// call the function with each link and filename
		await getFile(downloadUrl, Filename);
	}
})().catch((err) => {
	// Report the failure and exit non-zero instead of leaving an unhandled rejection.
	console.error(err);
	process.exitCode = 1;
});
/**
 * Send an HTTPS GET request and stream the response body to a file.
 *
 * The body is piped as raw bytes, so binary files such as .xls are written
 * unchanged. The output file is only created once a 2xx response has arrived,
 * so a failed request does not leave an empty or error-page file behind.
 *
 * @param {string} downloadUrl - Absolute https:// URL of the file.
 * @param {string} Filename - Path of the file to write.
 * @returns {Promise<string>} Resolves with `Filename` once the data has been
 *     flushed to the file. Rejects on an invalid URL, a request or response
 *     error, a non-2xx status code, or a write error.
 */
function getFile(downloadUrl, Filename) {
	return new Promise((resolve, reject) => {
		// https.get() ends the request itself, so no req.end() is needed.
		// It throws synchronously on an invalid URL; inside the executor that
		// becomes a rejection.
		const req = https.get(downloadUrl, (res) => {
			if (res.statusCode < 200 || res.statusCode >= 300) {
				// Discard the body so the socket is released.
				res.resume();
				reject(new Error(`Request for ${downloadUrl} failed with status ${res.statusCode}`));
				return;
			}

			const writeStream = fs.createWriteStream(Filename);
			writeStream.on('error', (err) => {
				res.destroy();
				reject(err);
			});
			// 'finish' fires after the last chunk has been written.
			writeStream.on('finish', () => resolve(Filename));

			res.on('error', (err) => {
				writeStream.destroy();
				reject(err);
			});
			res.on('end', () => {
				console.log('No more data in response.');
			});
			res.pipe(writeStream);
		});
		req.on('error', reject);
	});
}

编辑:看到您的评论后补充一点,您可以通过修改 GET 请求来发送 cookie,但要记住 cookie 受同域策略的限制

/**
 * Download `downloadUrl` over HTTPS, sending a Cookie header, and stream the
 * response body to `Filename`.
 *
 * The request path includes the URL's query string, and a non-default port in
 * the URL is honoured. The output file is only created once a 2xx response has
 * arrived, so a failed request does not leave an empty or error-page file
 * behind.
 *
 * @param {string} downloadUrl - Absolute https:// URL of the file.
 * @param {string} Filename - Path of the file to write.
 * @param {string} [cookie='myCookie=myvalue'] - Value of the Cookie request
 *     header, e.g. 'name=value; other=value'.
 * @returns {Promise<string>} Resolves with `Filename` once the data has been
 *     flushed to the file. Rejects on an invalid URL, a request or response
 *     error, a non-2xx status code, or a write error.
 */
function getFile(downloadUrl, Filename, cookie = 'myCookie=myvalue') {
  return new Promise((resolve, reject) => {
    // new URL() throws on an invalid URL; inside the executor that becomes a
    // rejection.
    const url = new URL(downloadUrl);
    const options = {
      hostname: url.hostname,
      // url.port is '' when the URL uses the default port for its scheme.
      port: url.port || 443,
      // pathname alone would drop "?query" parameters that are part of the link.
      path: `${url.pathname}${url.search}`,
      method: 'GET',
      headers: {
        'Cookie': cookie,
      },
    };

    const req = https.request(options, (res) => {
      if (res.statusCode < 200 || res.statusCode >= 300) {
        // Discard the body so the socket is released.
        res.resume();
        reject(new Error(`Request for ${downloadUrl} failed with status ${res.statusCode}`));
        return;
      }

      const writeStream = fs.createWriteStream(Filename);
      writeStream.on('error', (err) => {
        res.destroy();
        reject(err);
      });
      // 'finish' fires after the last chunk has been written.
      writeStream.on('finish', () => resolve(Filename));

      res.on('error', (err) => {
        writeStream.destroy();
        reject(err);
      });
      res.on('end', () => {
        console.log('No more data in response.');
      });
      res.pipe(writeStream);
    });
    req.on('error', reject);
    // Unlike https.get(), https.request() must be ended explicitly.
    req.end();
  });
}