目标:将大型文件上传到AWS Glacier,而无需将整个文件保存在内存中。
我目前使用fs.readFileSync()上传到Glacier,一切运行正常。但是,我需要处理大于4GB的文件,并且希望并行上传多个块,这意味着要改用分段上传(multipart upload)。我可以自行选择块大小,但Glacier要求每个块的大小相同(最后一个除外)。
这个帖子表明我可以在读取流上设置块大小,但实际上并不能保证每次都得到该大小的块。
有没有办法在不将整个文件读入内存并手动拆分的情况下,获得大小一致的分块?
假设我能做到这一点,我打算使用cluster,让几个进程以它们上传到AWS的最快速度从流中取出数据。如果这种并行化方式看起来不对,欢迎提出建议。
答案 0 :(得分:11)
如果没有其他办法,您可以手动使用fs.open()、fs.read()和fs.close()。例如:
// Read a file in fixed-size chunks through one reusable buffer, so the whole
// file is never held in memory at once.
const CHUNK_SIZE = 10 * 1024 * 1024; // 10MB
// Buffer.alloc() replaces the deprecated `new Buffer(size)` constructor.
const buffer = Buffer.alloc(CHUNK_SIZE);
const filePath = '/tmp/foo';

fs.open(filePath, 'r', function(err, fd) {
  if (err) throw err;

  // Reads up to CHUNK_SIZE bytes into `buffer`. Passing `null` as the position
  // makes fs.read() continue from the current file position.
  function readNextChunk() {
    fs.read(fd, buffer, 0, CHUNK_SIZE, null, function(err, nread) {
      if (err) throw err;

      if (nread === 0) {
        // done reading file, do any necessary finalization steps
        fs.close(fd, function(err) {
          if (err) throw err;
        });
        return;
      }

      // `data` is a view onto the shared `buffer` (no copy is made), so it must
      // be consumed or copied before the next read overwrites it.
      var data;
      if (nread < CHUNK_SIZE)
        // Short read: expose only the bytes that were actually read.
        data = buffer.subarray(0, nread);
      else
        data = buffer;

      // do something with `data`, then call `readNextChunk();`
    });
  }

  readNextChunk();
});
答案 1 :(得分:0)
您可以考虑使用下面的代码片段,它以1024字节为一块读取文件:
var fs = require('fs');
// Stream the file as UTF-8 text with highWaterMark set to 1024 bytes.
const streamOptions = { highWaterMark: 1024, encoding: 'utf8' };
const readStream = fs.createReadStream('/tmp/foo.txt', streamOptions);

// Accumulates every chunk so the complete contents can be printed at the end.
let data = '';

// Called for each 'data' event: append the chunk and echo it.
const handleChunk = (chunk) => {
  data += chunk;
  console.log('chunk Data : ');
  console.log(chunk); // your processing chunk logic will go here
};

// Called once on 'end': print everything that was accumulated.
const handleEnd = () => {
  console.log('###################');
  console.log(data);
  // here you see all data processed at end of file
};

readStream.on('data', handleChunk);
readStream.on('end', handleEnd);
请注意:highWaterMark是用于设置块大小的参数。希望这会有所帮助!
网络参考:https://stackabuse.com/read-files-with-node-js/ Changing readstream chunksize
答案 2 :(得分:0)
基于mscdex的回答,这是一个使用同步API并借助StringDecoder正确解码UTF-8的模块。
readableStream的问题在于,要使用它,您必须将整个项目改为使用异步事件发射器和回调。如果您只是编写简单的代码(例如用nodejs写一个小型CLI),这样做就没有意义。
// Usage: open a file with a 1024-byte chunk size and print each decoded
// chunk until the reader reports that it is no longer open.
const file = new UTF8FileReader();
file.open('./myfile.txt', 1024);
while (file.isOpen) {
  console.log(file.readChunk());
}
//--------------------
// UTF8FileReader.ts
//--------------------
import * as fs from 'fs';
import { StringDecoder, NodeStringDecoder } from "string_decoder";
/**
 * Synchronous, chunked reader for UTF-8 text files.
 *
 * Each readChunk() call reads up to `chunkSize` bytes with fs.readSync and
 * feeds them through a StringDecoder, so a multi-byte character that straddles
 * a chunk boundary is decoded intact instead of being split.
 */
export class UTF8FileReader {
    filename: string;
    isOpen: boolean = false;
    private chunkSize: number;
    private fd: number; //file descriptor returned by fs.openSync
    private readFilePos: number;
    private readBuffer: Buffer;
    private utf8decoder: NodeStringDecoder

    /**
     * Open a file for chunked reading.
     *
     * If this reader already has a file open, that file is closed once the new
     * one has been opened successfully, so its descriptor is not leaked.
     *
     * @param filename path of the file to open
     * @param chunkSize maximum number of bytes read per readChunk() call
     * @throws Error if the file cannot be opened; whatever Buffer.alloc throws
     *         if chunkSize is not a valid buffer size
     */
    open(filename, chunkSize: number = 16 * 1024) {
        let fd: number;
        try {
            fd = fs.openSync(filename, 'r');
        }
        catch (e) {
            throw new Error("opening " + filename + ", error:" + e.toString());
        }
        let readBuffer: Buffer;
        try {
            readBuffer = Buffer.alloc(chunkSize);
        }
        catch (e) {
            //do not leak the descriptor we just opened
            fs.closeSync(fd);
            throw e;
        }
        //release a previously opened file, if any, before replacing its state
        this.close();
        this.chunkSize = chunkSize;
        this.fd = fd;
        this.filename = filename;
        this.isOpen = true;
        this.readBuffer = readBuffer;
        this.readFilePos = 0;
        //a StringDecoder is a buffered object that ensures complete UTF-8 multibyte decoding from a byte buffer
        this.utf8decoder = new StringDecoder('utf8')
    }

    /**
     * Read the next chunk from the file and return it decoded as a string.
     *
     * Returns '' when the reader is not open. The call that hits end-of-file
     * closes the reader and returns whatever the decoder still had buffered
     * (normally ''; a replacement character if the file ends in the middle of
     * a multi-byte sequence).
     *
     * @throws Error if the underlying read fails
     */
    readChunk(): string {
        if (!this.isOpen) {
            return '';
        }
        let readByteCount: number;
        try {
            readByteCount = fs.readSync(this.fd, this.readBuffer, 0, this.chunkSize, this.readFilePos);
        }
        catch (e) {
            throw new Error("reading " + this.filename + ", error:" + e.toString());
        }
        if (readByteCount === 0) {
            //read returns 0 => all bytes read; flush the decoder so a truncated
            //trailing sequence is reported instead of being silently dropped
            const tail = this.utf8decoder.end();
            this.close();
            return tail;
        }
        //some data read, advance readFilePos
        this.readFilePos += readByteCount;
        //decode only the bytes actually read (the last chunk may be short);
        //write() decodes and returns the complete characters seen so far
        return this.utf8decoder.write(this.readBuffer.subarray(0, readByteCount));
    }

    /**
     * Close the file if it is open; calling it on a closed reader is a no-op.
     * Any bytes still buffered in the decoder are discarded.
     */
    close() {
        if (!this.isOpen) {
            return;
        }
        fs.closeSync(this.fd);
        this.isOpen = false;
        this.utf8decoder.end();
    }
}
如果不使用TypeScript,下面是转换后的.js代码:
// UTF8FileReader.js
"use strict";
Object.defineProperty(exports, "__esModule", { value: true });
exports.UTF8FileReader = void 0;
//--------------------
// UTF8FileReader
//--------------------
const fs = require("fs");
const string_decoder_1 = require("string_decoder");
/**
 * Synchronous, chunked reader for UTF-8 text files.
 *
 * Each readChunk() call reads up to `chunkSize` bytes with fs.readSync and
 * feeds them through a StringDecoder, so a multi-byte character that straddles
 * a chunk boundary is decoded intact instead of being split.
 */
class UTF8FileReader {
    constructor() {
        this.isOpen = false;
    }

    /**
     * Open a file for chunked reading.
     *
     * If this reader already has a file open, that file is closed once the new
     * one has been opened successfully, so its descriptor is not leaked.
     *
     * @param {string} filename path of the file to open
     * @param {number} [chunkSize] maximum number of bytes read per readChunk() call
     * @throws {Error} if the file cannot be opened; whatever Buffer.alloc throws
     *         if chunkSize is not a valid buffer size
     */
    open(filename, chunkSize = 16 * 1024) {
        let fd;
        try {
            fd = fs.openSync(filename, 'r');
        }
        catch (e) {
            throw new Error("opening " + filename + ", error:" + e.toString(), { cause: e });
        }
        let readBuffer;
        try {
            readBuffer = Buffer.alloc(chunkSize);
        }
        catch (e) {
            //do not leak the descriptor we just opened
            fs.closeSync(fd);
            throw e;
        }
        //release a previously opened file, if any, before replacing its state
        this.close();
        this.chunkSize = chunkSize;
        this.fd = fd;
        this.filename = filename;
        this.isOpen = true;
        this.readBuffer = readBuffer;
        this.readFilePos = 0;
        //a StringDecoder is a buffered object that ensures complete UTF-8 multibyte decoding from a byte buffer
        this.utf8decoder = new string_decoder_1.StringDecoder('utf8');
    }

    /**
     * Read the next chunk from the file and return it decoded as a string.
     *
     * Returns '' when the reader is not open. The call that hits end-of-file
     * closes the reader and returns whatever the decoder still had buffered
     * (normally ''; a replacement character if the file ends in the middle of
     * a multi-byte sequence).
     *
     * @returns {string} the decoded text of this chunk
     * @throws {Error} if the underlying read fails
     */
    readChunk() {
        if (!this.isOpen) {
            return '';
        }
        let readByteCount;
        try {
            readByteCount = fs.readSync(this.fd, this.readBuffer, 0, this.chunkSize, this.readFilePos);
        }
        catch (e) {
            throw new Error("reading " + this.filename + ", error:" + e.toString(), { cause: e });
        }
        if (readByteCount === 0) {
            //read returns 0 => all bytes read; flush the decoder so a truncated
            //trailing sequence is reported instead of being silently dropped
            const tail = this.utf8decoder.end();
            this.close();
            return tail;
        }
        //some data read, advance readFilePos
        this.readFilePos += readByteCount;
        //decode only the bytes actually read (the last chunk may be short);
        //write() decodes and returns the complete characters seen so far
        return this.utf8decoder.write(this.readBuffer.subarray(0, readByteCount));
    }

    /**
     * Close the file if it is open; calling it on a closed reader is a no-op.
     * Any bytes still buffered in the decoder are discarded.
     */
    close() {
        if (!this.isOpen) {
            return;
        }
        fs.closeSync(this.fd);
        this.isOpen = false;
        this.utf8decoder.end();
    }
}
exports.UTF8FileReader = UTF8FileReader;