I am trying to read a huge gzip-compressed file and process each line.
I tried two different implementations.
The commonly recommended implementation is 100x slower than the alternative. Am I doing something wrong, or is the Popen().stdout implementation really that bad? (It seems to read the file character by character.)
from time import time
from subprocess import Popen, PIPE
# We generate a csv file with 1M lines of 3D coordinates
from random import random
import os
N = 1000000
PATH = 'test'
GZIP_PATH = 'test.gz'
with open(PATH, 'w') as datafile:
    for i in xrange(N):
        datafile.write('{0}, {1}, {2}\n'.format(random(), random(), random()))
try:
    os.remove(GZIP_PATH)
except OSError:
    pass
Popen(['gzip', PATH]).wait()
# We want to process the file line by line
# We start with a textbook implementation
def simple_generator(file):
    line = file.readline()
    while line:
        yield line[:-1]
        line = file.readline()
with Popen(['gunzip', '-c', GZIP_PATH], stdout=PIPE).stdout as datafile:
    t = time()
    i = 0
    for line in simple_generator(datafile):
        i += 1  # process the line
print time()-t
print i
# Now we try a lower-level implementation
BLOCK_SIZE = 1<<16
def fast_generator(file):
    rem = ''
    block = file.read(BLOCK_SIZE)
    while block:
        lines = block.split('\n')
        # Glue the leftover partial line from the previous block onto the
        # first line of this block, yield all complete lines, and keep the
        # trailing partial line for the next iteration.
        lines[0] = rem + lines[0]
        for i in xrange(0, len(lines)-1):
            yield lines[i]
        rem = lines[-1]
        block = file.read(BLOCK_SIZE)
    # Assumes the file ends with '\n'; otherwise the final partial line
    # left in rem would never be yielded.
with Popen(['gunzip', '-c', GZIP_PATH], stdout=PIPE).stdout as datafile:
    t = time()
    i = 0
    for line in fast_generator(datafile):
        i += 1  # process the line
print time()-t
print i
# Output:
#
# 34.0195429325
# 1000000
# 0.232397794724
# 1000000
#
# The second implementation is 100x faster!
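A side note: in Python 2, iterating a file object directly uses an internal read-ahead buffer that is refilled in multi-KB chunks, so the plainest loop of all may avoid the per-character reads even on an unbuffered pipe; the trade-off is that read-ahead makes this pattern unsuitable for interactive pipes. A minimal sketch under that assumption:

# Sketch: rely on file-object iteration (Python 2 read-ahead) instead of
# readline(); the iterator refills its internal buffer in large chunks.
with Popen(['gunzip', '-c', GZIP_PATH], stdout=PIPE).stdout as datafile:
    t = time()
    i = 0
    for line in datafile:  # note: lines keep their trailing '\n' here
        i += 1  # process the line
print time()-t
print i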
Answer (score: 1)
The correct implementation is to pass bufsize=-1 to Popen:
with Popen(['gunzip', '-c', GZIP_PATH], stdout=PIPE, bufsize=-1).stdout as datafile:
    t = time()
    i = 0
    for line in simple_generator(datafile):
        i += 1  # process the line
print time()-t
print i
But I am surprised that the default behavior is bufsize=0 (unbuffered).
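An alternative that makes the buffering explicit rather than relying on Popen defaults (for reference, the default only changed to bufsize=-1 in Python 3.3.1) is to wrap the pipe's file descriptor in a buffered reader from the io module. A minimal sketch, assuming Python 2.6+ where io is available:

import io
# Sketch: io.open() on the raw pipe fd returns an io.BufferedReader, so
# readline()/iteration pulls data in 64 KB chunks regardless of the
# bufsize passed to Popen.
proc = Popen(['gunzip', '-c', GZIP_PATH], stdout=PIPE)
datafile = io.open(proc.stdout.fileno(), 'rb', buffering=1<<16)
t = time()
i = 0
for line in datafile:
    i += 1  # process the line
print time()-t
print i
proc.wait()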