Reading very large gzip files with client-side JavaScript, without node.js

Date: 2014-10-21 22:59:13

Tags: javascript, inflate

I want to write a JavaScript routine, similar to Java's GzipInputStream, that reads very large gzip files on the client side without using node.js.

Here is my code (not yet published), inspired by here:

function ab2string(buf) {
   var str = "";
   var ab = new Uint16Array(buf);
   var abLen = ab.length;
   var CHUNK_SIZE = Math.pow(2, 16);
   var offset, len, subab;
   for (offset = 0; offset < abLen; offset += CHUNK_SIZE) {
      len = Math.min(CHUNK_SIZE, abLen-offset);
      subab = ab.subarray(offset, offset+len);
      str += String.fromCharCode.apply(null, subab);
   }
   return str;
}
function string2ab(str) {
  var buf = new ArrayBuffer(str.length*2); // 2 bytes for each char
  var bufView = new Uint16Array(buf);
  for (var i=0, strLen=str.length; i<strLen; i++) {
    bufView[i] = str.charCodeAt(i);
  }
  return buf;
}
function FileGzipStreamer() {
    var loopholeReader = new FileReader();
    var chunkReader = new FileReader(); 
    var delimiter = "\n".charCodeAt(0); 

    var expectedChunkSize = 500000; // Slice size to read
    var loopholeSize = 500;         // Slice size to search for line end

    var file = null;
    var fileSize;
    var loopholeStart;
    var loopholeEnd;
    var chunkStart;
    var chunkEnd;
    var allString;
    var lines;
    var thisForClosure = this;
    var handler;
    var fulltext=[];
    var fulltext2=[];
    var fextra=false;
    var fname=false;
    var fcomment=false;
    var fhcrc=false;
    var counter=0;
    var counter2=0;
    var binString=[];


    // Reading of loophole ended
    loopholeReader.onloadend = function(evt) {
        // Read error
        if (evt.target.readyState != FileReader.DONE) {
            handler(null, new Error("Not able to read loophole (start: )"));
            return;
        }
        binString=[];
        binString=evt.target.result.split('').map(function(e){return e.charCodeAt(0);});
        fulltext=fulltext.concat(binString);
        var len=fulltext.length;
        $("#conclusion").append("\n"+"Length="+len+"\n");
        var start=0;
        if (fulltext[0]==31 && fulltext[1]==139) {
            if (fulltext[2]==8) {
                start=10;
                if (Number(fulltext[3]&4)!=4 && Number(fulltext[3]&2)!=2 && Number(fulltext[3]&1)!=1 && Number(fulltext[3]&128)!=128) {
                    if (Number(fulltext[3]&32)==32) {
                        fextra=true;
                    }
                    if (Number(fulltext[3]&16)==16) {
                        fname=true;
                    }
                    if (Number(fulltext[3]&8)==8) {
                        fcomment=true;
                    }
                    if (Number(fulltext[3]&64)==64) {
                        fhcrc=true;
                    }
                }
                else {
                    $("#conclusion").append("Gzip file is invalid");
                }
                start=10;
                if (fextra==true) {
                    var incrementor=fulltext[start]+256*fulltext[start+1];
                    start+=incrementor+2; // 2 for xlen
                }
                if (fname==true) {
                    start+=1;
                    while(fulltext[start-1]!=0)
                        start+=1;
                }
                if (fcomment==true) {
                    start+=1;
                    while(fulltext[start-1]!=0)
                        start+=1;
                }
                if (fhcrc==true) {
                    start+=2;
                }
                var uncompressed=zip_inflate(ab2string(fulltext.slice(28,len)));
                var splitline=uncompressed.split("\n");
                //$("#conclusion").append(splitline.length+"\n");
                var temp=counter;
                $("#conclusion").append("\n"+"Counter="+counter+", Splitlinelength="+splitline.length+"\n");
                var uncompressed2="";
                //var test=Math.random();
                //$("#conclusion").append(uncompressed);
                for (var i=temp;i<splitline.length-5; i++) {
                    counter+=1;
                    uncompressed2+=splitline[i]+"\n";
                    //if (splitline[i].indexOf("\n")!=-1)
                    //$("#conclusion").append(i+"start"+splitline[i]+"end\n");
                    $("#conclusion").append(splitline[i]);
                    $("#conclusion").append("\n");
                }
                var view = new DataView(string2ab(uncompressed2));
                var realLoopholeSize = loopholeEnd - loopholeStart;
                //$("#conclusion").append("1"+uncompressed+"\n\n\n");
                //$("#conclusion").append(realLoopholeSize+'--'+fulltext.length+'x');
                for(var i = realLoopholeSize - 1; i >= 0; i--) {
                    if (view.getInt8(i) == delimiter) {
                        chunkEnd = loopholeStart + i + 1;
                        var blob = file.slice(chunkStart, chunkEnd);
                        $("#conclusion").append(chunkStart+'xxz'+chunkEnd+'y');
                        chunkReader.readAsBinaryString(blob);
                        return;
                    }
                }

                // No delimiter found, looking in the next loophole
                $("#conclusion").append("test");
                loopholeStart = loopholeEnd;
                loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize);
                thisForClosure.getNextLine();
                //$("#conclusion").append(zip_inflate(String.fromCharCode.apply(null,fulltext.slice(start,len))));
            }
            else {
                $("#conclusion").append("Unknown compression method!");
            }
        }
        else{
            $("#conclusion").append("Not a gzipped file!");
        }
        //$("#conclusion").append(zip_inflate(String.fromCharCode.apply(null,fulltext)));
        //fulltext=fulltext.concat(arr2);
        //var theText=zip_inflate(String.fromCharCode.apply(null,fulltext.slice(start,len)));
        //$("#conclusion").append("yy"+loopholeEnd+'--'+loopholeStart);
        // No delimiter found, looking in the next loophole
        //loopholeStart = loopholeEnd;
        //loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize);

        //thisForClosure.getNextLine();
    };

    // Reading of chunk ended
    chunkReader.onloadend = function(evt) {
        // Read error
        if (evt.target.readyState != FileReader.DONE) {
            handler(null, new Error("Not able to read loophole"));
            return;
        }
        var binString2=evt.target.result.split('').map(function(e){return e.charCodeAt(0);});
        $("#conclusion").append("text2="+binString+"\n");
        fulltext2=fulltext2.concat(binString2);
        var len2=fulltext2.length;
        var start2=0;
        if (fulltext2[0]==31 && fulltext2[1]==139) {
            if (fulltext2[2]==8) {
                start2=10;
                if (Number(fulltext2[3]&4)!=4 && Number(fulltext2[3]&2)!=2 && Number(fulltext2[3]&1)!=1 && Number(fulltext2[3]&128)!=128) {
                    if (Number(fulltext2[3]&32)==32) {
                        fextra=true;
                    }
                    if (Number(fulltext2[3]&16)==16) {
                        fname=true;
                    }
                    if (Number(fulltext2[3]&8)==8) {
                        fcomment=true;
                    }
                    if (Number(fulltext2[3]&64)==64) {
                        fhcrc=true;
                    }
                }   
                else {
                    $("#conclusion").append("Gzip file is invalid");
                }
                if (fextra==true) {
                    var incrementor=fulltext2[start2]+256*fulltext2[start2+1];
                    start2+=incrementor+2; // 2 for xlen
                }
                if (fname==true) {
                    start2+=1;
                    while(fulltext2[start2-1]!=0)
                        start2+=1;
                }
                if (fcomment==true) {
                    start2+=1;
                    while(fulltext2[start2-1]!=0)
                        start2+=1;
                }
                if (fhcrc==true) {
                    start2+=2;
                }
            }
        }
        //$("#conclusion").append(zip_inflate(String.fromCharCode.apply(null,binString)));
        //binString=binString.concat(arr2);
        var theText=zip_inflate(ab2string(fulltext2.slice(start2,len2)));
        //var temp=counter;
        //var splitline2=theText.split(/\r?\n/);
        //var uncompressed3="";
        //var test=Math.random();
        //for (var i=0;i<splitline2.length; i++) {
            //uncompressed3+=splitline2[i]+"\n";
            //$("#conclusion").append(splitline2[i]);
        //}

        //$("#conclusion").append("3"+theText+"\n\n\n");
        // Remove last new line in the end of chunk
        if (lines.length > 0 && lines[lines.length - 1] == "") {
            lines.pop();
        }
        var temp=0;
        var lines = theText.split(/\r?\n/);
        for (var i=temp;i<lines.length; i++) {
            //counter+=1;
            //uncompressed2+=splitline[i]+"\n";
            //if (splitline[i].indexOf("\n")!=-1)
            //$("#conclusion").append(i+"start"+splitline[i]+"end\n");
            $("#conclusion").append(lines[i]);
            $("#conclusion").append("\n");
        }
        chunkStart = chunkEnd;
        chunkEnd = Math.min(chunkStart, fileSize);
        loopholeStart = Math.min(chunkEnd, fileSize);
        loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize); 
        thisForClosure.getNextLine();
    };


    // Public: open file for reading
    this.open = function (fileToOpen, linesProcessed) {
        file = fileToOpen;
        fileSize = file.size;
        loopholeStart = 0;
        loopholeEnd = Math.min(loopholeStart + loopholeSize, fileSize);
        chunkStart = 0;
        chunkEnd = 0;
        lines = null;
        handler = linesProcessed;
    };

    // Public: start getting new line async
    this.getNextLine = function() {
        // File wasn't open
        if (file == null) {     
            handler(null, new Error("You must open a file first"));
            return;
        }
        // Some lines available
        if (lines != null) {
            var linesForClosure = lines;
            setTimeout(function() { handler(linesForClosure, null) }, 0);
            lines = null;
            return;
        }
        // End of File
        if (chunkStart == fileSize) {
            handler(null, null);
            return;
        }
        // File part bigger than expectedChunkSize is left
        if (loopholeStart < fileSize) {
            var blob = file.slice(loopholeStart, loopholeEnd);
            loopholeReader.readAsBinaryString(blob);
        }
        // All file can be read at once
        else {
            chunkEnd = fileSize;
            var blob = file.slice(chunkStart, fileSize);
            chunkReader.readAsBinaryString(blob);
        }
    };
};

The algorithm here looks simple: skip the header and call an inflate() routine like this on the compressed blocks. But since the gzip files are very large (tens or hundreds of GB), I need to inflate the compressed data chunk by chunk.
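
For reference, a streaming inflater such as pako (not something this post uses, so the following is only a sketch of the idea) can be fed the file slice by slice; it auto-detects the gzip wrapper, so the header does not have to be parsed by hand. The chunk size and logging below are assumptions:

// Sketch only: assumes the pako library is loaded on the page.
var CHUNK = 1024 * 1024;                           // slice size, arbitrary
var inflator = new pako.Inflate({ to: 'string' }); // auto-detects the gzip header

inflator.onData = function (text) {
    // called with every piece of decompressed output
    console.log("got " + text.length + " chars");
};

function readSlice(file, offset) {
    var reader = new FileReader();
    reader.onload = function (evt) {
        var last = offset + CHUNK >= file.size;
        inflator.push(new Uint8Array(evt.target.result), last);
        if (!last) readSlice(file, offset + CHUNK);
    };
    reader.readAsArrayBuffer(file.slice(offset, offset + CHUNK));
}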

Is there a way, without using Node.js, to partition the compressed blocks in JavaScript and inflate them as fast as Java's GzipInputStream?

1 Answer:

Answer 0 (score: 2)

In Node we can create a readable stream from a file (fs.createReadStream()) and pipe it to zlib.createGunzip(). The readable stream reads the data in chunks and passes each chunk to the gunzip sink, so if we feed a gzip-ed file into this setup we get the extracted data chunk by chunk.
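
For comparison, a plain Node version of that pipeline might look like this (only a sketch; the file name is a placeholder):

var fs = require('fs');
var zlib = require('zlib');

// read the gzip file in chunks and pipe them through gunzip
fs.createReadStream('big.log.gz')     // placeholder file name
  .pipe(zlib.createGunzip())
  .on('data', function (chunk) {
    // chunk is a Buffer holding part of the decompressed data
    console.log(chunk.toString());
  })
  .on('end', function () {
    console.log('done');
  });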

With the help of browserify, we can do the same thing in the browser.

E.g. with this main.js file:

// browserify automatically replaces the node's native zlib with this:
// https://www.npmjs.com/package/browserify-zlib
var zlib = require('zlib');

var drop = require('drag-and-drop-files');
var createReadStream = require('filereader-stream');

var gunzip = zlib.createGunzip();

drop(document.getElementById('drop'), function(files) {
  var first = files[0];
  createReadStream(first).pipe(gunzip);

  gunzip.on('data', function(data){
    // read the data chunk-by-chunk
    console.log(data.toString());
  });

  gunzip.on('end', function(){
    console.log('done');
  });
});

To make it run in the browser, we just sprinkle the browserify magic on it:

$ browserify main.js > bundle.js

Then we can load bundle.js from an index.html (don't forget the drop-zone), roughly as sketched below.
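
A minimal index.html could look like this (just a sketch; the only requirements are the #drop element that main.js looks up and the script tag for bundle.js):

<!doctype html>
<html>
  <body>
    <!-- the drop zone that main.js attaches to -->
    <div id="drop" style="width:300px;height:150px;border:2px dashed #999">
      Drop a .gz file here
    </div>
    <script src="bundle.js"></script>
  </body>
</html>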

I quickly put together a poc in this repo (for the streaming part; to handle a really huge file we may also need the Web Worker API).
