I have looked at several posts, including this one, but none of them helped. Here is the Python code I currently use to split the file. My input file is 15 GB, and I am splitting it into 128 MB pieces; my machine has 8 GB of RAM.
import sys

def read_line(f_object,terminal_byte):
    line = ''.join(iter(lambda:f_object.read(1),terminal_byte))
    line+="\x01"
    return line

def read_lines(f_object,terminal_byte):
    tmp = read_line(f_object,terminal_byte)
    while tmp:
        yield tmp
        tmp = read_line(f_object,terminal_byte)

def make_chunks(f_object,terminal_byte,max_size):
    current_chunk = []
    current_chunk_size = 0
    for line in read_lines(f_object,terminal_byte):
        current_chunk.append(line)
        current_chunk_size += len(line)
        if current_chunk_size > max_size:
            yield "".join(current_chunk)
            current_chunk = []
            current_chunk_size = 0
    if current_chunk:
        yield ''.join(current_chunk)

inputfile=sys.argv[1]

with open(inputfile,"rb") as f_in:
    for i,chunk in enumerate(make_chunks(f_in, bytes(chr(1)),1024*1000*128)):
        with open("out%d.txt"%i,"wb") as f_out:
            f_out.write(chunk)
When I execute the script, I get the following error:
Traceback (most recent call last):
  File "splitter.py", line 30, in <module>
    for i,chunk in enumerate(make_chunks(f_in, bytes(chr(1)),1024*1000*128)):
  File "splitter.py", line 17, in make_chunks
    for line in read_lines(f_object,terminal_byte):
  File "splitter.py", line 12, in read_lines
    tmp = read_line(f_object,terminal_byte)
  File "splitter.py", line 4, in read_line
    line = ''.join(iter(lambda:f_object.read(1),terminal_byte))
MemoryError
Answer 0 (score: 1)
Question: splitting a big file into smaller files

Instead of finding every single \x01, do this only in the last chunk. Either reset the file pointer to offset+1 of the last found \x01 and continue, or write up to offset into the current chunk file and write the remaining part of the chunk into the next chunk file.
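As a minimal sketch of that seek-back step on a plain file object (the helper name trim_to_separator is made up here, and bytes.rfind stands in for the manual reverse scan used in the full example further below):

def trim_to_separator(fh, chunk, sep=b"\x01"):
    # Hypothetical helper: drop the bytes after the last `sep` in `chunk`
    # and rewind `fh` so the next read picks them up again.
    pos = chunk.rfind(sep)
    if pos != -1 and pos != len(chunk) - 1:
        tail = len(chunk) - (pos + 1)   # bytes after the last separator
        fh.seek(fh.tell() - tail)       # reset the file pointer
        chunk = chunk[:pos + 1]         # keep only complete records
    return chunk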
Note: your chunk_size should be io.DEFAULT_BUFFER_SIZE or a multiple of it. You gain no speedup by raising chunk_size very high. Read this related SO Q&A: Default buffer size for a file
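For reference, the default buffer size is easy to inspect, and the question's 128 MB target happens to be an exact multiple of it (a quick check, not part of the answer's example):

import io

print(io.DEFAULT_BUFFER_SIZE)                      # 8192 on CPython
print(128 * 1024 * 1024 % io.DEFAULT_BUFFER_SIZE)  # 0, so 128 MiB qualifies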
My example shows usage of resetting the file pointer, for instance:
import io

large_data = b"""Lorem ipsum\x01dolor sit\x01sadipscing elitr, sed\x01labore et\x01dolores et ea rebum.\x01magna aliquyam erat,\x01"""

def split(chunk_size, split_size):
    with io.BytesIO(large_data) as fh_in:
        _size = 0
        # Used to verify chunked writes
        result_data = io.BytesIO()

        while True:
            chunk = fh_in.read(chunk_size)
            print('read({})'.format(bytearray(chunk)))
            if not chunk:
                break

            _size += chunk_size
            if _size >= split_size:
                _size = 0
                # Split on last 0x01
                l = len(chunk)
                print('\tsplit_on_last_\\x01({})\t{}'.format(l, bytearray(chunk)))

                # Reverse iterate
                for p in range(l-1, -1, -1):
                    c = chunk[p:p+1]
                    if ord(c) == ord('\x01'):
                        offset = l-(p+1)

                        # Condition if \x01 is the Last Byte in chunk
                        if offset == 0:
                            print('\toffset={} write({})\t\t{}'.format(offset, l - offset, bytearray(chunk)))
                            result_data.write(chunk)
                        else:
                            # Reset Filepointer
                            fh_in.seek(fh_in.tell()-offset)
                            print('\toffset={} write({})\t\t{}'.format(offset, l-offset, bytearray(chunk[:-offset])))
                            result_data.write(chunk[:-offset])
                        break
            else:
                print('\twrite({}) {}'.format(chunk_size, bytearray(chunk)))
                result_data.write(chunk)

        print('INPUT :{}\nOUTPUT:{}'.format(large_data, result_data.getvalue()))

if __name__ == '__main__':
    split(chunk_size=30, split_size=60)

Output:

read(bytearray(b'Lorem ipsum\x01dolor sit\x01sadipsci'))
    write(30) bytearray(b'Lorem ipsum\x01dolor sit\x01sadipsci')
read(bytearray(b'ng elitr, sed\x01labore et\x01dolore'))
    split_on_last_\x01(30)    bytearray(b'ng elitr, sed\x01labore et\x01dolore')
    offset=6 write(24)        bytearray(b'ng elitr, sed\x01labore et\x01')
read(bytearray(b'dolores et ea rebum.\x01magna ali'))
    write(30) bytearray(b'dolores et ea rebum.\x01magna ali')
read(bytearray(b'quyam erat,\x01'))
    split_on_last_\x01(12)    bytearray(b'quyam erat,\x01')
    offset=0 write(12)        bytearray(b'quyam erat,\x01')
read(bytearray(b''))
INPUT :b'Lorem ipsum\x01dolor sit\x01sadipscing elitr, sed\x01labore et\x01dolores et ea rebum.\x01magna aliquyam erat,\x01'
OUTPUT:b'Lorem ipsum\x01dolor sit\x01sadipscing elitr, sed\x01labore et\x01dolores et ea rebum.\x01magna aliquyam erat,\x01'
Tested with Python: 3.4.2
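Applied to the question's setup, a 15 GB input split into roughly 128 MB pieces on \x01 boundaries, the whole approach might look like the sketch below. The out%d.txt naming comes from the question; the rest is one illustration of the technique, not the answer's exact code:

import sys

SEP = b"\x01"
CHUNK_SIZE = 128 * 1024 * 1024  # read ~128 MB at a time, well below 8 GB of RAM

def split_file(path, chunk_size=CHUNK_SIZE, sep=SEP):
    with open(path, "rb") as fh_in:
        part = 0
        while True:
            chunk = fh_in.read(chunk_size)
            if not chunk:
                break
            pos = chunk.rfind(sep)
            # Seek back so the next part starts right after the last
            # separator; if no separator was found (a record longer than
            # chunk_size), the whole chunk is written unchanged.
            if pos != -1 and pos != len(chunk) - 1:
                tail = len(chunk) - (pos + 1)
                fh_in.seek(fh_in.tell() - tail)
                chunk = chunk[:pos + 1]
            with open("out%d.txt" % part, "wb") as fh_out:
                fh_out.write(chunk)
            part += 1

if __name__ == '__main__':
    split_file(sys.argv[1])

Reading in fixed 128 MB blocks instead of one byte at a time is what avoids the MemoryError: memory use stays bounded at one chunk regardless of the input size.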