我拥有的最小文件也有超过 850k 行,且每行长度未知。目标是在浏览器中从此文件中读取前 n
行。完整读取整个文件是不可行的。
这是HTML <input type="file" name="file" id="file">
和我拥有的JS:
// Number of lines we want to print.
var n = 10;
var reader = new FileReader();
reader.onload = function (progressEvent) {
  // Dump the whole file first...
  console.log(this.result);
  // ...then split on newlines and print the first n lines one by one.
  var allLines = this.result.split('\n');
  for (var i = 0; i < n; i++) {
    console.log(allLines[i]);
  }
};
显然,这里的问题是它试图先把整个文件读入内存,然后再用换行符拆分它。因此无论 n
是多少,它都会尝试读取整个文件,而当文件很大时最终什么也读不到。
我该怎么做?
注意:我愿意删除整个功能并从头开始,因为我能够console.log()
我们读过的每一行。
*"每一行都是未知的长度" -> 意味着该文件是这样的:
(0, (1, 2))
(1, (4, 5, 6))
(2, (7))
(3, (8))
编辑:
可行的方法类似于filereader api on big files,但我无法看到如何修改该内容以阅读文件的n
行...
通过使用Uint8Array to string in Javascript,可以从那里做到:
// Interpret the raw ArrayBuffer produced by the FileReader as UTF-8 text.
var bytes = new Uint8Array(fr.result);
var text = new TextDecoder("utf-8").decode(bytes);
console.log("Chunk " + text);
但这样可能会把最后一行截断,那么之后又该如何确定剩余的行呢?例如,这是它打印的内容:
((7202), (u'11330875493', u'2554375661'))
((1667), (u'9079074735', u'6883914476',
答案 0 :(得分:12)
逻辑与我在filereader api on big files的回答中写的非常相似,除了你需要跟踪到目前为止已处理的行数(以及到目前为止读取的最后一行,因为它可能还没有结束)。下一个示例适用于与UTF-8兼容的任何编码;如果您需要其他编码,请查看TextDecoder
构造函数的选项。
如果您确定输入是ASCII(或任何其他单字节编码),那么您也可以跳过TextDecoder
的使用,并使用FileReader
's readAsText
method直接将输入作为文本读取。
// Demo driver: wires the button up to readSomeLines (defined below).
document.getElementById('start').onclick = function () {
  var selected = document.getElementById('infile').files[0];
  if (!selected) {
    console.log('No file selected.');
    return;
  }
  var limit = parseInt(document.getElementById('maxlines').value, 10);
  var counter = 1;
  readSomeLines(
    selected,
    limit,
    function (line) {
      console.log("Line: " + (counter++) + line);
    },
    function onComplete() {
      console.log('Read all lines');
    }
  );
};
/**
 * Read up to and including |maxlines| lines from |file|, chunk by chunk,
 * without ever loading the whole file into memory.
 *
 * @param {Blob} file - The file (or any Blob) to be read.
 * @param {integer} maxlines - The maximum number of lines to read.
 * @param {function(string)} forEachLine - Called once per line (including
 *     the trailing '\n', except possibly for the final unterminated line).
 * @param {function(error)} onComplete - Called when the end of the file
 *     is reached or when |maxlines| lines have been read; receives the
 *     FileReader error on failure, no argument on success.
 */
function readSomeLines(file, maxlines, forEachLine, onComplete) {
  var CHUNK_SIZE = 50000; // 50kb per slice, arbitrarily chosen.
  var decoder = new TextDecoder();
  var offset = 0;
  var linecount = 0;
  var leftover = ''; // Text after the last newline seen so far.
  var fr = new FileReader();

  fr.onload = function () {
    // stream:true so a multi-byte character cut at a chunk boundary is
    // decoded correctly once the rest of its bytes arrive.
    leftover += decoder.decode(fr.result, { stream: true });
    var lines = leftover.split('\n');
    leftover = lines.pop(); // Possibly incomplete last line; keep for later.
    linecount += lines.length;
    if (linecount > maxlines) {
      // Read too many lines? Truncate the surplus.
      lines.length -= linecount - maxlines;
      linecount = maxlines;
    }
    for (var i = 0; i < lines.length; ++i) {
      forEachLine(lines[i] + '\n');
    }
    offset += CHUNK_SIZE;
    pump();
  };
  fr.onerror = function () {
    onComplete(fr.error);
  };
  pump();

  // Decide whether to stop or to read the next chunk.
  function pump() {
    if (linecount === maxlines) {
      // We found enough lines.
      onComplete(); // Done.
      return;
    }
    if (offset !== 0 && offset >= file.size) {
      // No more data; flush the final (unterminated) line, if any.
      forEachLine(leftover);
      onComplete(); // Done.
      return;
    }
    fr.readAsArrayBuffer(file.slice(offset, offset + CHUNK_SIZE));
  }
}
&#13;
Read <input type="number" id="maxlines"> lines from
<input type="file" id="infile">.
<input type="button" id="start" value="Print lines to console">
&#13;
答案 1 :(得分:2)
Streams是功能!
whatwg团队正在研究关于可写+可读流的最后一点,并且很快就准备好了。但在此之前,您可以使用web-stream-polyfill。
他们正在研究从 blob 获取 ReadableStream 的方法。但我也创建了一种以流式方式读取 blob 的方法:[1]
昨天我还创建了一个Screw-FileReader的port来处理网络流
所以这可能很简单:
// Simulate a file with a small in-memory blob.
var csv =
`apple,1,$1.00
banana,4,$0.20
orange,3,$0.79`
var file = new Blob([csv])

var n = 0
var controller
var textDecoder = new TextDecoder()

// Writable sink that aborts the whole pipeline after the first line.
var stdout = new WritableStream({
  start(c) {
    // Keep the controller around so write() can error the stream.
    controller = c
  },
  write(chunk, a) {
    // Calling controller.error also puts byLine in an errored state,
    // which in turn stops the file stream from reading more data.
    if (n === 1) controller.error("don't need more lines")
    chunk = textDecoder.decode(chunk)
    console.log(`chunk[${n++}]: ${chunk}`)
  }
})

file
  .stream()
  .pipeThrough(byLine())
  // .pipeThrough(new TextDecoder) something like this will work eventually
  .pipeTo(stdout)
&#13;
<script src="https://cdn.rawgit.com/creatorrr/web-streams-polyfill/master/dist/polyfill.min.js"></script>
<script src="https://cdn.rawgit.com/jimmywarting/Screw-FileReader/master/index.js"></script>
<!-- after a year or so you only need byLine -->
<script src="https://cdn.rawgit.com/jimmywarting/web-byline/master/index.js"></script>
&#13;
答案 2 :(得分:1)
我需要在浏览器中读取一个 250MB 的 utf-8 编码文件。我的解决方案是编写一个类似 C# TextReader 的类,它为我提供类似流的异步逐行读取行为。
TextReader类:
/**
 * Stream-like asynchronous line reader over a Blob/File.
 * Reads the blob in CHUNK_SIZE byte slices, buffers raw bytes, and only
 * decodes up to the last complete line so that multi-byte UTF-8 characters
 * split across chunk boundaries are never decoded partially.
 */
class TextReader {
    CHUNK_SIZE = 8192000; // Bytes per blob read; tune for your workload.
    position = 0;         // Next byte offset to read from the blob.
    length = 0;           // Total blob size in bytes.
    byteBuffer = new Uint8Array(0); // Undecoded bytes after the last newline.
    lines = [];           // Decoded lines not yet handed out.
    lineCount = 0;
    lineIndexTracker = 0;
    // FileReader is only needed as a fallback when Blob#arrayBuffer is
    // unavailable; guard so the class also constructs where FileReader
    // does not exist (e.g. workers/Node).
    fileReader = typeof FileReader === `function` ? new FileReader() : null;
    textDecoder = new TextDecoder(`utf-8`);

    // True once every cached line has been handed out by readLine().
    get allCachedLinesAreDispatched() {
        return !(this.lineIndexTracker < this.lineCount);
    }

    // True once every byte of the blob has been pulled into the buffer.
    get blobIsReadInFull() {
        return !(this.position < this.length);
    }

    get bufferIsEmpty() {
        return this.byteBuffer.length === 0;
    }

    // True when there is nothing left anywhere: blob, cache, or buffer.
    get endOfStream() {
        return this.blobIsReadInFull && this.allCachedLinesAreDispatched && this.bufferIsEmpty;
    }

    /** @param {Blob} blob - The blob/file to read lines from. */
    constructor(blob) {
        this.blob = blob;
        this.length = blob.size;
    }

    /**
     * Resolve a blob slice to an ArrayBuffer.
     * Prefers the standard promise-based Blob#arrayBuffer(); falls back to
     * FileReader for older browsers.
     * @returns {Promise<ArrayBuffer>}
     */
    blob2arrayBuffer(blob) {
        if (typeof blob.arrayBuffer === `function`) {
            return blob.arrayBuffer();
        }
        return new Promise((resolve, reject) => {
            // Reject with the actual error, not the error event.
            this.fileReader.onerror = () => reject(this.fileReader.error);
            this.fileReader.onload = () => resolve(this.fileReader.result);
            this.fileReader.readAsArrayBuffer(blob);
        });
    }

    /**
     * Read up to |count| bytes starting at |offset|.
     * Out-of-range or non-integer requests resolve to an empty buffer,
     * which readLine() treats as end-of-data.
     * @returns {Promise<ArrayBuffer>}
     */
    async read(offset, count) {
        if (!Number.isInteger(offset) || !Number.isInteger(count) || count < 1 || offset < 0 || offset > this.length - 1) {
            return new ArrayBuffer(0);
        }
        let endIndex = offset + count;
        if (endIndex > this.length) endIndex = this.length;
        return this.blob2arrayBuffer(this.blob.slice(offset, endIndex));
    }

    /**
     * Read the next line.
     * @returns {Promise<string|null>} The line (with trailing '\n', except
     *     possibly the last line of the blob), or null at end of stream.
     */
    async readLine() {
        // Serve a cached line if any remain from the previous decode.
        if (!this.allCachedLinesAreDispatched) {
            return this.lines[this.lineIndexTracker++] + `\n`;
        }
        // Pull chunks until the byte buffer contains at least one newline.
        while (!this.blobIsReadInFull) {
            let arrayBuffer = await this.read(this.position, this.CHUNK_SIZE);
            this.position += arrayBuffer.byteLength;
            // Append the new chunk to the leftover bytes.
            let merged = new Uint8Array(this.byteBuffer.length + arrayBuffer.byteLength);
            merged.set(this.byteBuffer);
            merged.set(new Uint8Array(arrayBuffer), this.byteBuffer.length);
            this.byteBuffer = merged;
            // '\n' is a single byte (10) in UTF-8, so a byte-level scan is safe.
            let lastIndexOfLineFeedCharacter = this.byteBuffer.lastIndexOf(10);
            if (lastIndexOfLineFeedCharacter > -1) {
                let lines = this.textDecoder.decode(this.byteBuffer).split(`\n`);
                // Bytes after the last newline stay buffered: they may be a
                // partial line and/or a partial multi-byte character.
                this.byteBuffer = this.byteBuffer.slice(lastIndexOfLineFeedCharacter + 1);
                let firstLine = lines[0];
                // Cache the complete lines; drop the trailing partial element.
                this.lines = lines.slice(1, lines.length - 1);
                this.lineCount = this.lines.length;
                this.lineIndexTracker = 0;
                return firstLine + `\n`;
            }
        }
        // Blob exhausted: flush the final, unterminated line, if any.
        if (!this.bufferIsEmpty) {
            let line = this.textDecoder.decode(this.byteBuffer);
            this.byteBuffer = new Uint8Array(0);
            return line;
        }
        return null;
    }
}
用法:
// Demo: read the selected file line by line through TextReader.
document.getElementById("read").onclick = async () => {
    const selectedFile = document.getElementById("fileInput").files[0];
    const textReader = new TextReader(selectedFile);

    // Variant 1: loop until readLine() signals end-of-file with null.
    for (;;) {
        const line = await textReader.readLine();
        if (line === null) {
            break;
        }
        // PROCESS LINE
    }

    // OR

    // Variant 2: poll the endOfStream flag instead.
    while (!textReader.endOfStream) {
        const line = await textReader.readLine();
        // PROCESS LINE
    }
};
性能:
我能够以流式方式读取一个约 250MB、由 1,398,258 行组成的 utf-8 编码文本文件,且 JS 堆大小不超过 20MB。相比之下,如果一次性读取同一文件,再用 \n 分割结果字符串,虽然也只需约 1.5 秒,但 JS 堆会飙升到 230MB。