读取一个大型文本文件的前n行

Question

读取一个大型文本文件的前n行

15

我拥有的最小文件有超过850k行，每一行长度未知。目标是在浏览器中读取n行。完全读取它是不可能的。

这是HTML代码：<input type="file" name="file" id="file"> 和我的JS代码：

// Number of lines we would like to print.
var n = 10;
var reader = new FileReader();

// Runs once the reader has finished loading; at that point the complete
// file content is already held in memory as a single string.
reader.onload = function(progressEvent) {
  var contents = this.result;

  // Entire file
  console.log(contents);

  // By lines
  var rows = contents.split('\n');
  var index = 0;
  while (index < n) {
    console.log(rows[index]);
    index += 1;
  }
};

显然，这里的问题在于它尝试先读取整个文件，然后按换行符拆分文件。所以无论n是多少，它都会尝试读取整个文件，当文件很大时最终什么都没有读取。

我该怎么做呢？

注：如果我能够console.log()每一行读取的内容，我愿意删除整个函数并重新开始。

*“每一行的长度未知”-> 意味着文件看起来像这样：

(0, (1, 2))
(1, (4, 5, 6))
(2, (7))
(3, (8))

编辑：

最好的方法就是像filereader api on big files那样，但我不知道如何修改它来读取文件的n行…

同时，使用Javascript中的Uint8Array转字符串，可以从那里开始实现：

// `fr` is not defined in this fragment; presumably it is the FileReader from
// the chunked-read example, with `fr.result` holding one chunk's bytes — TODO confirm.
var view = new Uint8Array(fr.result);
// Decode the bytes of this chunk as UTF-8 text.
var string = new TextDecoder("utf-8").decode(view);
// Print the decoded chunk. Nothing here aligns the chunk with line boundaries.
console.log("Chunk " + string);

但这可能无法将最后一行作为整体读取，那么你后面要如何确定这些行呢？例如，下面是它所打印的内容：

((7202), (u'11330875493', u'2554375661'))
((1667), (u'9079074735', u'6883914476',

- gsamaras

"...但那不应该有关系。" 你到底是怎么想的？没有行开始的索引和在给定索引处递增读取文件的能力，这绝对很重要。 - T.J. Crowder

@T.J.Crowder，我更新了我的问题并进行了澄清，也许我应该删除那个陈述，你是对的！ - gsamaras

需要稍微多一些上下文信息。您正在使用HTML和JavaScript。这是在Web浏览器中运行的JavaScript吗？还是这是作为类似于HTML POST的响应而执行的JavaScript？ - Alan

啊，忘了@Alan，已经更新了！在浏览器中。 - gsamaras

看看这个StackOverflow的答案，它很相似：https://dev59.com/fV8e5IYBdhLWcg3wYJiu 。它的要点是使用 .slice，以块的形式读取数据，然后在处理每个块时进行操作。 - Alan

3个回答

5

我需要在浏览器中读取250MB的UTF-8编码文件。我的解决方案是编写类似于C# TextReader类的代码，以提供类似于异步流的行为。

TextReader类：

/**
 * Reads a Blob/File line by line without loading it into memory at once.
 *
 * The blob is read in CHUNK_SIZE-byte slices. Bytes are accumulated until at
 * least one line feed is present; every complete line is then decoded as
 * UTF-8 and cached, and the bytes after the last line feed are kept for the
 * next read. Only one readLine() call should be in flight at a time because
 * a single FileReader instance is shared.
 */
class TextReader {
    CHUNK_SIZE = 8192000; // I FOUND THIS TO BE BEST FOR MY NEEDS, CAN BE ADJUSTED
    position = 0; // Byte offset of the next slice to read from the blob.
    length = 0; // Total size of the blob in bytes.

    // Bytes already read that do not yet form a complete line.
    byteBuffer = new Uint8Array(0);

    // Decoded lines waiting to be handed out by readLine().
    lines = [];
    lineCount = 0;
    lineIndexTracker = 0;

    fileReader = new FileReader();
    textDecoder = new TextDecoder(`utf-8`);

    /** True when every cached line has already been returned. */
    get allCachedLinesAreDispatched() {
        return !(this.lineIndexTracker < this.lineCount);
    }

    /** True when the read position has reached the end of the blob. */
    get blobIsReadInFull() {
        return !(this.position < this.length);
    }

    /** True when no undecoded bytes are pending. */
    get bufferIsEmpty() {
        return this.byteBuffer.length === 0;
    }

    /** True when readLine() has nothing left to return. */
    get endOfStream() {
        return this.blobIsReadInFull && this.allCachedLinesAreDispatched && this.bufferIsEmpty;
    }

    /**
     * @param {Blob} blob - The blob (or File) to read from.
     */
    constructor(blob) {
        this.blob = blob;
        this.length = blob.size;
    }

    /**
     * Reads a blob into an ArrayBuffer using the shared FileReader.
     *
     * @param {Blob} blob - The blob slice to read.
     * @returns {Promise<ArrayBuffer>} Resolves with the bytes; rejects with
     *     whatever the FileReader passes to its onerror handler.
     */
    blob2arrayBuffer(blob) {
        return new Promise((resolve, reject) => {
            this.fileReader.onerror = reject;
            this.fileReader.onload = () => {
                resolve(this.fileReader.result);
            };

            this.fileReader.readAsArrayBuffer(blob);
        });
    }

    /**
     * Reads up to `count` bytes starting at byte `offset`.
     *
     * @param {number} offset - Zero-based byte offset into the blob.
     * @param {number} count - Maximum number of bytes to read.
     * @returns {Promise<ArrayBuffer>} The bytes read; an empty buffer when the
     *     arguments are not integers or are out of range. Rejects if the
     *     underlying read fails.
     */
    async read(offset, count) {
        const isValidRequest =
            Number.isInteger(offset) &&
            Number.isInteger(count) &&
            count >= 1 &&
            offset >= 0 &&
            offset <= this.length - 1;

        if (!isValidRequest) {
            return new ArrayBuffer(0);
        }

        const endIndex = Math.min(offset + count, this.length);

        // Returning the promise directly lets a read failure reject the caller
        // instead of being lost inside an async Promise executor.
        return this.blob2arrayBuffer(this.blob.slice(offset, endIndex));
    }

    /**
     * Returns the next line of the blob.
     *
     * @returns {Promise<string|null>} The next line including its trailing
     *     `\n`; the final line without `\n` if the blob does not end with a
     *     line feed; `null` once everything has been returned.
     * @throws {Error} If a read unexpectedly yields no bytes (for example
     *     when CHUNK_SIZE is not a positive integer).
     */
    async readLine() {
        if (!this.allCachedLinesAreDispatched) {
            return this.lines[this.lineIndexTracker++] + `\n`;
        }

        while (!this.blobIsReadInFull) {
            const arrayBuffer = await this.read(this.position, this.CHUNK_SIZE);

            if (arrayBuffer.byteLength === 0) {
                // Without progress this loop would never terminate.
                throw new Error(`TextReader: read returned no data; CHUNK_SIZE must be a positive integer`);
            }

            this.position += arrayBuffer.byteLength;

            const mergedBuffer = new Uint8Array(this.byteBuffer.length + arrayBuffer.byteLength);
            mergedBuffer.set(this.byteBuffer);
            mergedBuffer.set(new Uint8Array(arrayBuffer), this.byteBuffer.length);
            this.byteBuffer = mergedBuffer;

            const lastIndexOfLineFeedCharacter = this.byteBuffer.lastIndexOf(10); // LINE FEED CHARACTER (\n) IS ONE BYTE LONG IN UTF-8 AND IS 10 IN ITS DECIMAL FORM

            if (lastIndexOfLineFeedCharacter > -1) {
                // Decode only the complete lines; the bytes after the last
                // line feed may end in the middle of a multi-byte character.
                const completeBytes = this.byteBuffer.subarray(0, lastIndexOfLineFeedCharacter);
                const [firstLine, ...remainingLines] = this.textDecoder.decode(completeBytes).split(`\n`);

                this.byteBuffer = this.byteBuffer.slice(lastIndexOfLineFeedCharacter + 1);

                this.lines = remainingLines;
                this.lineCount = remainingLines.length;
                this.lineIndexTracker = 0;

                return firstLine + `\n`;
            }
        }

        if (!this.bufferIsEmpty) {
            // Final line of a blob that does not end with a line feed.
            const line = this.textDecoder.decode(this.byteBuffer);
            this.byteBuffer = new Uint8Array(0);
            return line;
        }

        return null;
    }
}

用法：

// Click handler for the #read element: reads the file selected in #fileInput
// line by line through a TextReader.
document.getElementById("read").onclick = async () => {
    let file = document.getElementById("fileInput").files[0];
    let textReader = new TextReader(file);

    // Variant 1: keep reading until readLine() resolves to null.
    while(true) {
        let line = await textReader.readLine();
        if(line === null) break;
        // PROCESS LINE
    }

    // OR
    // Variant 2: keep reading until the reader reports endOfStream.
    // The two loops are alternatives; as written they run one after the
    // other on the same reader, so keep only one of them.

    while (!textReader.endOfStream) {
        let line = await textReader.readLine();
        // PROCESS LINE
    }
};

性能：

我能够在 JS 堆大小不超过 20MB 的情况下，在大约 1.5 秒的时间内读取一个由 1,398,258 行组成、大小为 250MB 的 utf-8 编码文本文件。相比之下，如果我一次性读取同样的文件，然后将结果字符串按 \n 分割，它仍然需要大约 1.5 秒，但是 JS 堆会增加到 230MB。

- Maz T

2

流是一个特性！whatwg团队正在解决关于可写+可读流的最后一个问题，并且很快就会准备好。但在那之前，你可以使用web-stream-polyfill。他们正在研究一种从blob中获取ReadableStream的方法[1]。但我已经创建了一种以流式方式获取blob的方法：Screw-FileReader。

昨天，我还创建了一个 node-byline 的简单移植版，使其改用 Web 流工作。

所以这可能很简单，如下所示：

// Build an in-memory Blob that stands in for a user-selected file
var csv = [
  'apple,1,$1.00',
  'banana,4,$0.20',
  'orange,3,$0.79'
].join('\n');

var file = new Blob([csv]);

// Number of chunks written to the sink so far
var n = 0;
var controller;
var decoder = new TextDecoder();

// Sink that logs every chunk it receives
var stdout = new WritableStream({
  start(c) {
    controller = c;
  },
  write(chunk, a) {
    // Calling controller.error will also put the byLine in an errored state
    // Causing the file stream to stop reading more data also
    if (n === 1) {
      controller.error("don't need more lines");
    }
    var text = decoder.decode(chunk);
    var index = n;
    n += 1;
    console.log('chunk[' + index + ']: ' + text);
  }
});

file
  .stream()
  .pipeThrough(byLine())
  // .pipeThrough(new TextDecoder) something like this will work eventually
  .pipeTo(stdout);

<script src="https://cdn.rawgit.com/creatorrr/web-streams-polyfill/master/dist/polyfill.min.js"></script>
<script src="https://cdn.rawgit.com/jimmywarting/Screw-FileReader/master/index.js"></script>

<!-- after a year or so you only need byLine -->
<script src="https://cdn.rawgit.com/jimmywarting/web-byline/master/index.js"></script>

- Endless

1

有趣的方法，不用说了！ :) - gsamaras

谢谢，期待这个功能的推出 :) - Endless

1

请勿鼓励使用带有外部输入的 innerHTML，因为它可能会引入安全漏洞。此外，document.body.innerHTML += 是不好的，因为它会强制重新解析整个文档。考虑改用 element.insertAdjacentText 或 document.createTextNode + element.appendChild。 - Rob W

他们已经花了一年时间在处理类似blob的ReadableStream上。但是仍然没有明显的进展。 - ﾤﾡ

网页内容由stack overflow 提供, 点击上面的

可以查看英文原文，
原文链接

- Rob W · Accepted Answer

该逻辑与我在filereader api on big files中的答案非常相似，只是您需要跟踪到目前为止处理的行数（以及最后读取的行，因为它可能还没有结束）。下一个示例适用于与UTF-8兼容的任何编码；如果您需要另一种编码，请查看 TextDecoder 构造函数的选项。

如果您确定输入是ASCII（或任何其他单字节编码），则也可以跳过 TextDecoder，直接使用 FileReader 的 readAsText 方法将输入读取为文本。

// This is just an example of the function below: when #start is clicked,
// print up to #maxlines lines of the file chosen in #infile to the console.
document.getElementById('start').onclick = function() {
    var file = document.getElementById('infile').files[0];
    if (!file) {
        console.log('No file selected.');
        return;
    }
    var maxlines = parseInt(document.getElementById('maxlines').value, 10);
    var lineno = 1;
    // readSomeLines is defined below.
    readSomeLines(file, maxlines, function(line) {
        console.log("Line: " + (lineno++) + line);
    }, function onComplete(error) {
        // readSomeLines passes the FileReader error here when a read fails;
        // report it instead of claiming that every line was read.
        if (error) {
            console.error('Failed to read file:', error);
            return;
        }
        console.log('Read all lines');
    });
};

/**
 * Read up to and including |maxlines| lines from |file|.
 *
 * The file is read in fixed-size chunks and decoded as UTF-8, so only a
 * small part of the file is held in memory at any time. Reading stops as
 * soon as |maxlines| lines have been delivered.
 *
 * @param {Blob} file - The file to be read.
 * @param {integer} maxlines - The maximum number of lines to read.
 * @param {function(string)} forEachLine - Called for each line. Lines that
 *     end with '\n' in the file are passed including that '\n'; a final
 *     line without a line feed is passed as-is. No call is made for an
 *     empty remainder at the end of the file.
 * @param {function(error)} onComplete - Called without arguments when the
 *     end of the file is reached or when |maxlines| lines have been read;
 *     called with the FileReader's error if reading fails.
 */
function readSomeLines(file, maxlines, forEachLine, onComplete) {
    const CHUNK_SIZE = 50000; // 50kb, arbitrarily chosen.
    const decoder = new TextDecoder();
    const fr = new FileReader();
    let offset = 0;
    let linecount = 0;
    // Text after the last '\n' seen so far, i.e. a line that may continue
    // in the next chunk.
    let results = '';

    fr.onload = function() {
        // Use stream:true in case we cut the file
        // in the middle of a multi-byte character
        results += decoder.decode(fr.result, {stream: true});
        const lines = results.split('\n');
        results = lines.pop(); // In case the line did not end yet.
        linecount += lines.length;

        if (linecount > maxlines) {
            // Read too many lines? Truncate the results.
            lines.length -= linecount - maxlines;
            linecount = maxlines;
        }

        for (const line of lines) {
            forEachLine(line + '\n');
        }
        offset += CHUNK_SIZE;
        seek();
    };
    fr.onerror = function() {
        onComplete(fr.error);
    };
    seek();

    // Either finishes (enough lines / end of file) or starts reading the
    // next chunk.
    function seek() {
        if (linecount === maxlines) {
            // We found enough lines.
            onComplete(); // Done.
            return;
        }
        if (offset !== 0 && offset >= file.size) {
            // We did not find all lines, but there are no more lines.
            // Flush bytes the streaming decoder may still be holding back.
            results += decoder.decode();
            if (results !== '') {
                // Last line of a file that does not end with '\n'.
                forEachLine(results);
            }
            onComplete(); // Done
            return;
        }
        const slice = file.slice(offset, offset + CHUNK_SIZE);
        fr.readAsArrayBuffer(slice);
    }
}

Read <input type="number" id="maxlines"> lines from
<input type="file" id="infile">.
<input type="button" id="start" value="Print lines to console">