I want to write a JavaScript routine, similar to Java's GzipInputStream, that reads very large gzip files on the client side without using node.js.
Here is my code (not published yet), inspired by here:
// Convert an ArrayBuffer (or array of char codes) to a string,
// applying String.fromCharCode in chunks to stay under apply() limits.
function ab2string(buf) {
    var str = "";
    var ab = new Uint16Array(buf);
    var abLen = ab.length;
    var CHUNK_SIZE = Math.pow(2, 16);
    var offset, len, subab;
    for (offset = 0; offset < abLen; offset += CHUNK_SIZE) {
        len = Math.min(CHUNK_SIZE, abLen - offset);
        subab = ab.subarray(offset, offset + len);
        str += String.fromCharCode.apply(null, subab);
    }
    return str;
}
// Convert a string back to an ArrayBuffer, 2 bytes per character.
function string2ab(str) {
    var buf = new ArrayBuffer(str.length * 2); // 2 bytes for each char
    var bufView = new Uint16Array(buf);
    for (var i = 0, strLen = str.length; i < strLen; i++) {
        bufView[i] = str.charCodeAt(i);
    }
    return buf;
}
function FileGzipStreamer() {
    var loopholeReader = new FileReader();
    var chunkReader = new FileReader();
    var delimiter = "\n".charCodeAt(0);
    var expectedChunkSize = 500000; // Slice size to read
    var loopholeSize = 500; // Slice size to search for line end
    var file = null;
    var fileSize;
    var loopholeStart;
    var loopholeEnd;
    var chunkStart;
    var chunkEnd;
    var allString;
    var lines;
    var thisForClosure = this;
    var handler;
    var fulltext = [];
    var fulltext2 = [];
    var fextra = false;   // gzip FEXTRA flag
    var fname = false;    // gzip FNAME flag
    var fcomment = false; // gzip FCOMMENT flag
    var fhcrc = false;    // gzip FHCRC flag
    var counter = 0;
    var counter2 = 0;
    var binString = [];
    // Reading of loophole ended
    loopholeReader.onloadend = function(evt) {
        // Read error
        if (evt.target.readyState != FileReader.DONE) {
            handler(null, new Error("Not able to read loophole (start: )"));
            return;
        }
        binString = [];
        binString = evt.target.result.split('').map(function(e) { return e.charCodeAt(0); });
        fulltext = fulltext.concat(binString);
        var len = fulltext.length;
        $("#conclusion").append("\n" + "Length=" + len + "\n");
        var start = 0;
        // Gzip magic bytes are 0x1f 0x8b -- both must match
        if (fulltext[0] == 31 && fulltext[1] == 139) {
            // Compression method 8 = deflate
            if (fulltext[2] == 8) {
                start = 10; // fixed part of the gzip header is 10 bytes
                // FLG byte (RFC 1952): bit0 FTEXT, bit1 FHCRC, bit2 FEXTRA,
                // bit3 FNAME, bit4 FCOMMENT; bits 5-7 are reserved.
                if ((fulltext[3] & 224) == 0) {
                    if ((fulltext[3] & 4) == 4) {
                        fextra = true;
                    }
                    if ((fulltext[3] & 8) == 8) {
                        fname = true;
                    }
                    if ((fulltext[3] & 16) == 16) {
                        fcomment = true;
                    }
                    if ((fulltext[3] & 2) == 2) {
                        fhcrc = true;
                    }
                }
                else {
                    $("#conclusion").append("Gzip file is invalid");
                }
                if (fextra == true) {
                    var incrementor = fulltext[start] + 256 * fulltext[start + 1];
                    start += incrementor + 2; // 2 for xlen
                }
                if (fname == true) {
                    start += 1;
                    while (fulltext[start - 1] != 0)
                        start += 1;
                }
                if (fcomment == true) {
                    start += 1;
                    while (fulltext[start - 1] != 0)
                        start += 1;
                }
                if (fhcrc == true) {
                    start += 2;
                }
                var uncompressed = zip_inflate(ab2string(fulltext.slice(start, len)));
                var splitline = uncompressed.split("\n");
                //$("#conclusion").append(splitline.length+"\n");
                var temp = counter;
                $("#conclusion").append("\n" + "Counter=" + counter + ", Splitlinelength=" + splitline.length + "\n");
                var uncompressed2 = "";
                //var test=Math.random();
                //$("#conclusion").append(uncompressed);
                // Skip the last few entries: they may belong to an incomplete block
                for (var i = temp; i < splitline.length - 5; i++) {
                    counter += 1;
                    uncompressed2 += splitline[i] + "\n";
                    //if (splitline[i].indexOf("\n")!=-1)
                    //$("#conclusion").append(i+"start"+splitline[i]+"end\n");
                    $("#conclusion").append(splitline[i]);
                    $("#conclusion").append("\n");
                }
                var view = new DataView(string2ab(uncompressed2));
                var realLoopholeSize = loopholeEnd - loopholeStart;
                //$("#conclusion").append("1"+uncompressed+"\n\n\n");
                //$("#conclusion").append(realLoopholeSize+'--'+fulltext.length+'x');
                for (var i = realLoopholeSize - 1; i >= 0; i--) {
                    if (view.getInt8(i) == delimiter) {
                        chunkEnd = loopholeStart + i + 1;
                        var blob = file.slice(chunkStart, chunkEnd);
                        $("#conclusion").append(chunkStart + 'xxz' + chunkEnd + 'y');
                        chunkReader.readAsBinaryString(blob);
                        return;
                    }
                }
                // No delimiter found, looking in the next loophole
                $("#conclusion").append("test");
                loopholeStart = loopholeEnd;
                loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize);
                thisForClosure.getNextLine();
                //$("#conclusion").append(zip_inflate(String.fromCharCode.apply(null,fulltext.slice(start,len))));
            }
            else {
                $("#conclusion").append("Unknown compression method!");
            }
        }
        else {
            $("#conclusion").append("Not a gzipped file!");
        }
        //$("#conclusion").append(zip_inflate(String.fromCharCode.apply(null,fulltext)));
        //fulltext=fulltext.concat(arr2);
        //var theText=zip_inflate(String.fromCharCode.apply(null,fulltext.slice(start,len)));
        //$("#conclusion").append("yy"+loopholeEnd+'--'+loopholeStart);
        // No delimiter found, looking in the next loophole
        //loopholeStart = loopholeEnd;
        //loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize);
        //thisForClosure.getNextLine();
    };
    // Reading of chunk ended
    chunkReader.onloadend = function(evt) {
        // Read error
        if (evt.target.readyState != FileReader.DONE) {
            handler(null, new Error("Not able to read loophole"));
            return;
        }
        var binString2 = evt.target.result.split('').map(function(e) { return e.charCodeAt(0); });
        $("#conclusion").append("text2=" + binString2 + "\n");
        fulltext2 = fulltext2.concat(binString2);
        var len2 = fulltext2.length;
        var start2 = 0;
        if (fulltext2[0] == 31 && fulltext2[1] == 139) {
            if (fulltext2[2] == 8) {
                start2 = 10;
                // Same FLG handling as above (RFC 1952)
                if ((fulltext2[3] & 224) == 0) {
                    if ((fulltext2[3] & 4) == 4) {
                        fextra = true;
                    }
                    if ((fulltext2[3] & 8) == 8) {
                        fname = true;
                    }
                    if ((fulltext2[3] & 16) == 16) {
                        fcomment = true;
                    }
                    if ((fulltext2[3] & 2) == 2) {
                        fhcrc = true;
                    }
                }
                else {
                    $("#conclusion").append("Gzip file is invalid");
                }
                if (fextra == true) {
                    var incrementor = fulltext2[start2] + 256 * fulltext2[start2 + 1];
                    start2 += incrementor + 2; // 2 for xlen
                }
                if (fname == true) {
                    start2 += 1;
                    while (fulltext2[start2 - 1] != 0)
                        start2 += 1;
                }
                if (fcomment == true) {
                    start2 += 1;
                    while (fulltext2[start2 - 1] != 0)
                        start2 += 1;
                }
                if (fhcrc == true) {
                    start2 += 2;
                }
            }
        }
        //$("#conclusion").append(zip_inflate(String.fromCharCode.apply(null,binString)));
        //binString=binString.concat(arr2);
        var theText = zip_inflate(ab2string(fulltext2.slice(start2, len2)));
        //var temp=counter;
        //var splitline2=theText.split(/\r?\n/);
        //var uncompressed3="";
        //var test=Math.random();
        //for (var i=0;i<splitline2.length; i++) {
        //uncompressed3+=splitline2[i]+"\n";
        //$("#conclusion").append(splitline2[i]);
        //}
        //$("#conclusion").append("3"+theText+"\n\n\n");
        lines = theText.split(/\r?\n/);
        // Remove last new line in the end of chunk
        if (lines.length > 0 && lines[lines.length - 1] == "") {
            lines.pop();
        }
        var temp = 0;
        for (var i = temp; i < lines.length; i++) {
            //counter+=1;
            //uncompressed2+=splitline[i]+"\n";
            //if (splitline[i].indexOf("\n")!=-1)
            //$("#conclusion").append(i+"start"+splitline[i]+"end\n");
            $("#conclusion").append(lines[i]);
            $("#conclusion").append("\n");
        }
        chunkStart = chunkEnd;
        chunkEnd = Math.min(chunkStart, fileSize);
        loopholeStart = Math.min(chunkEnd, fileSize);
        loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize);
        thisForClosure.getNextLine();
    };
    // Public: open file for reading
    this.open = function(fileToOpen, linesProcessed) {
        file = fileToOpen;
        fileSize = file.size;
        loopholeStart = 0;
        loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize);
        chunkStart = 0;
        chunkEnd = 0;
        lines = null;
        handler = linesProcessed;
    };
    // Public: start getting new line async
    this.getNextLine = function() {
        // File wasn't open
        if (file == null) {
            handler(null, new Error("You must open a file first"));
            return;
        }
        // Some lines available
        if (lines != null) {
            var linesForClosure = lines;
            setTimeout(function() { handler(linesForClosure, null); }, 0);
            lines = null;
            return;
        }
        // End of File
        if (chunkStart == fileSize) {
            handler(null, null);
            return;
        }
        // File part bigger than expectedChunkSize is left
        if (loopholeStart < fileSize) {
            var blob = file.slice(loopholeStart, loopholeEnd);
            loopholeReader.readAsBinaryString(blob);
        }
        // All file can be read at once
        else {
            chunkEnd = fileSize;
            var blob = file.slice(chunkStart, fileSize);
            chunkReader.readAsBinaryString(blob);
        }
    };
};
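For reference, a minimal driver for the streamer above might look like this (the file-input element id and the console logging are my assumptions, not part of the code):

var streamer = new FileGzipStreamer();
// Hypothetical <input type="file" id="file-input"> element
document.getElementById("file-input").addEventListener("change", function(evt) {
    streamer.open(evt.target.files[0], function(lines, err) {
        if (err != null) { console.error(err); return; }
        if (lines == null) { console.log("EOF"); return; } // handler(null, null) signals end of file
        lines.forEach(function(line) { console.log(line); });
        streamer.getNextLine(); // pull the next batch of lines
    });
    streamer.getNextLine();
});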
The algorithm here looks simple: skip the header and call an inflate() routine like this on the compressed blocks. But since the gzip files are very large (tens or hundreds of GB), I need to inflate the compressed blocks chunk by chunk.
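For comparison, here is a minimal sketch of such a chunk-by-chunk inflate loop with the pako library (an assumption on my part; pako is not used in the code above). pako's streaming Inflate auto-detects the gzip wrapper by default, so no manual header skipping is needed:

// Sketch only: assumes pako (https://github.com/nodeca/pako) is loaded.
function inflateFileByChunks(file, onText, onDone) {
    var CHUNK = 1024 * 1024; // read 1 MB of compressed data at a time
    var inflator = new pako.Inflate({ to: "string" });
    var offset = 0;
    var reader = new FileReader();
    inflator.onData = function(text) { onText(text); }; // called once per inflated chunk
    reader.onload = function() {
        var last = offset + CHUNK >= file.size;
        inflator.push(new Uint8Array(reader.result), last); // true flushes the stream
        offset += CHUNK;
        if (!last) reader.readAsArrayBuffer(file.slice(offset, offset + CHUNK));
        else onDone(inflator.err);
    };
    reader.readAsArrayBuffer(file.slice(offset, offset + CHUNK));
}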
Is there a way to partition the compressed blocks and inflate them in JavaScript, without node.js, as fast as Java's GzipInputStream?
Answer 0 (score: 2)
In node we can create a readable stream from a file (fs.createReadStream()) and pipe it into zlib.createGunzip(). The readable stream reads the data in chunks and passes them on to the gunzip sink. So if we feed a gzip-ed file into this setup, we get the extracted data chunk by chunk.
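Spelled out, the plain-node version of that pipeline is just this (the file name is hypothetical):

var fs = require('fs');
var zlib = require('zlib');

fs.createReadStream('big.txt.gz')
    .pipe(zlib.createGunzip())
    .on('data', function(chunk) {
        console.log(chunk.toString()); // extracted data, chunk by chunk
    })
    .on('end', function() {
        console.log('done');
    });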
With the help of browserify we can do the same in the browser, e.g. with this main.js file:
// browserify automatically replaces node's native zlib with this:
// https://www.npmjs.com/package/browserify-zlib
var zlib = require('zlib');
var drop = require('drag-and-drop-files');
var createReadStream = require('filereader-stream');

var gunzip = zlib.createGunzip();

drop(document.getElementById('drop'), function(files) {
    var first = files[0];
    createReadStream(first).pipe(gunzip);

    gunzip.on('data', function(data) {
        // read the data chunk-by-chunk
        console.log(data.toString());
    });

    gunzip.on('end', function() {
        console.log('done');
    });
});
To make it work in the browser, we put the browserify charm on it:
$ browserify main.js > bundle.js
Then we can load bundle.js in an index.html (don't forget the drop zone).
I quickly put together a PoC in this repo (the streaming part works; for a really large file we may need to use the web worker API).
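As an aside (my addition, not part of the original answer): newer browsers ship a native streaming gunzip, so a line-by-line reader can be sketched without any bundling, assuming DecompressionStream and Blob.stream() support:

async function readGzipLines(file, onLine) {
    var reader = file.stream()
        .pipeThrough(new DecompressionStream('gzip'))
        .pipeThrough(new TextDecoderStream())
        .getReader();
    var tail = '';
    for (;;) {
        var r = await reader.read();
        if (r.done) break;
        var parts = (tail + r.value).split('\n');
        tail = parts.pop(); // keep the incomplete last line for the next chunk
        parts.forEach(onLine);
    }
    if (tail) onLine(tail);
}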