下面的代码会把用户选择的输入文件一次性完整读入内存。对于非常大(> 10 GB)的文件,这需要大量内存,因此我需要逐行读取文件。
在Pyodide中如何一次读取一行文件?
当前的代码如下:
<!doctype html>
<html>
<head>
<script src="https://cdn.jsdelivr.net/pyodide/v0.22.1/full/pyodide.js"></script>
</head>
<body>
<button>Analyze input</button>
<script type="text/javascript">
/**
 * Click handler: lets the user pick a file, reads the whole file into memory
 * as text, lower-cases it with a Python function executed by Pyodide, and
 * appends a download link for the result to the page.
 *
 * The complete file content is held in a single string, so memory use grows
 * with the size of the selected file.
 *
 * @returns {Promise<void>} Resolves once the download link has been appended.
 */
async function main() {
  // Get the file contents into JS
  const [fileHandle] = await showOpenFilePicker();
  const fileData = await fileHandle.getFile();
  const contents = await fileData.text();

  // Create the Python convert toy function.
  // Python is indentation-sensitive: top-level statements start at column 0
  // and the function body is indented, otherwise the source does not parse.
  const pyodide = await loadPyodide();
  const convert = pyodide.runPython(`
from pyodide.ffi import to_js

def convert(contents):
    return to_js(contents.lower())

convert
`);

  const result = convert(contents);
  console.log(result);

  // Offer the converted text as a downloadable file.
  const blob = new Blob([result], { type: 'application/text' });
  const url = window.URL.createObjectURL(blob);
  const downloadLink = document.createElement("a");
  downloadLink.href = url;
  downloadLink.text = "Download output";
  downloadLink.download = "out.txt";
  document.body.appendChild(downloadLink);
}
// Run main() whenever the first <button> element on the page is clicked.
const button = document.querySelector('button');
button.addEventListener('click', main);
</script>
</body>
</html>
这段代码来自问题“从用户文件系统选择并读取文件”的一个回答。
基于 rth 的答案,我使用了以下代码。它仍然有两个问题:
- 数据块的边界会把某些行切成两部分。以下面的示例输入文件(每行 100 个字符)为例,控制台日志(见下文)显示数据块并不总是在换行符处结束。
- 我无法将变量 `result` 写入可供用户下载的输出文件(见下文;为了便于演示,这里暂时用占位字符串 `'result'` 代替)。
<!doctype html>
<html>
<head>
<script src="https://cdn.jsdelivr.net/pyodide/v0.22.1/full/pyodide.js"></script>
</head>
<body>
<button>Analyze input</button>
<script type="text/javascript">
/**
 * Click handler: streams the user-selected file chunk by chunk, passes each
 * chunk's UTF-8 decoded text to a Python function executed by Pyodide, and
 * finally appends a download link to the page.
 *
 * Known limitations of this version:
 * - Chunks are forwarded exactly as the stream delivers them, so a chunk may
 *   start or end in the middle of a line.
 * - Each chunk's converted text stays local to handleChunk; the download link
 *   contains the placeholder string 'result' instead.
 *
 * @returns {Promise<void>} Resolves once the download link has been appended.
 */
async function main() {
  // Create the Python convert toy function.
  // Python is indentation-sensitive: top-level statements start at column 0
  // and nested bodies are indented, otherwise the source does not parse.
  const pyodide = await loadPyodide();
  const convert = pyodide.runPython(`
from pyodide.ffi import to_js

def convert(contents):
    for line in contents.split('\\n'):
        print(len(line))
    return to_js(contents.lower())

convert
`);

  // Get the file contents into JS
  const bytes_func = pyodide.globals.get('bytes');
  const [fileHandle] = await showOpenFilePicker();
  const fh = await fileHandle.getFile();
  const stream = fh.stream();
  const reader = stream.getReader();

  // Do a loop until end of file
  while (true) {
    const { done, value } = await reader.read();
    if (done) { break; }
    handleChunk(value);
  }
  console.log("all done");

  // Converts one chunk: wraps the buffer with Python's bytes(), decodes it as
  // UTF-8 and hands the text to convert(). The return value is not used
  // outside this function.
  function handleChunk(buf) {
    console.log("received a new buffer", buf.byteLength);
    const result = convert(bytes_func(buf).decode('utf-8'));
  }

  // Placeholder content: the literal string 'result', not the converted text.
  const blob = new Blob(['result'], { type: 'application/text' });
  const url = window.URL.createObjectURL(blob);
  const downloadLink = document.createElement("a");
  downloadLink.href = url;
  downloadLink.text = "Download output";
  downloadLink.download = "out.txt";
  document.body.appendChild(downloadLink);
}
// Run main() whenever the first <button> element on the page is clicked.
const button = document.querySelector('button');
button.addEventListener('click', main);
</script>
</body>
</html>
假设有一个每行包含100个字符的输入文件:
perl -le 'for (1..1e5) { print "0" x 100 }' > test_100x1e5.txt
我得到了如下控制台日志输出,它表明数据块并不是在换行符处切分的:
received a new buffer 65536
648pyodide.asm.js:10 100
pyodide.asm.js:10 88
read_write_bytes_func.html:41 received a new buffer 2031616
pyodide.asm.js:10 12
20114pyodide.asm.js:10 100
pyodide.asm.js:10 89
read_write_bytes_func.html:41 received a new buffer 2097152
pyodide.asm.js:10 11
20763pyodide.asm.js:10 100
pyodide.asm.js:10 77
read_write_bytes_func.html:41 received a new buffer 2097152
pyodide.asm.js:10 23
20763pyodide.asm.js:10 100
pyodide.asm.js:10 65
read_write_bytes_func.html:41 received a new buffer 2097152
pyodide.asm.js:10 35
20763pyodide.asm.js:10 100
pyodide.asm.js:10 53
read_write_bytes_func.html:41 received a new buffer 1711392
pyodide.asm.js:10 47
16944pyodide.asm.js:10 100
pyodide.asm.js:10 0
read_write_bytes_func.html:37 all done
如果我把这一行:
const blob = new Blob(['result'], {type : 'application/text'});
变成这样:
const blob = new Blob([result], {type : 'application/text'});
就会收到如下错误:
Uncaught (in promise) ReferenceError: result is not defined
at HTMLButtonElement.main (read_write_bytes_func.html:45:34)
const [fileHandle] = await showOpenFilePicker(); const fileData = await fileHandle.getFile(); const contents = await fileData.text();
- Timur Shtatland