bz2 模块提供了标准的 open() 方法,可以在它返回的对象上调用 readline()。但是,我的情况是:我有一个流(指向大量数据),想要边读取边按行解压缩。我目前的实现如下,但我知道一定有更简洁的方法来做到这一点。
import bz2
import csv
# Module-level state shared by bz2_csv_rows() and bz2_line_reader().  Because
# the state is global, only one stream can be read at a time.
BZ2_BUFFER = b''  # decompressed bytes not yet handed out; None once exhausted
BZ2_DECOMPRESSOR = None  # bz2.BZ2Decompressor for the current stream
BZ2_FILE = None  # object whose read() supplies compressed bytes; None at EOF
BZ2_READ_SIZE = 100 * 1024  # number of compressed bytes requested per read()
BZ2_ENCODING = 'utf-8'  # encoding used to turn decompressed lines into text


def bz2_csv_rows(fp):
    """Yield CSV rows parsed from a bz2-compressed binary stream.

    Args:
        fp: Object with a ``read(size)`` method returning compressed bytes
            (``b''`` at end of input).

    Yields:
        One list of strings per CSV record, as produced by ``csv.reader``.

    Raises:
        OSError: If the stream does not contain valid bz2 data.
        UnicodeDecodeError: If a line is not valid ``BZ2_ENCODING`` text.

    The reader state lives in module globals, so a second call resets the
    state of any generator that is still being consumed.
    """
    global BZ2_BUFFER, BZ2_DECOMPRESSOR, BZ2_FILE
    BZ2_BUFFER = b''
    BZ2_DECOMPRESSOR = bz2.BZ2Decompressor()
    BZ2_FILE = fp
    # None (rather than an empty string) marks the end of the stream, so a
    # blank line in the data cannot terminate the iteration early.
    for row in csv.reader(iter(bz2_line_reader, None)):
        yield row


def bz2_line_reader():
    """Return the next decompressed line of the current stream, or None.

    Lines are returned as text and keep their trailing ``'\\n'`` so that
    ``csv.reader`` can reassemble quoted fields containing line breaks.  The
    last line is returned even when it has no trailing newline.  Once the
    stream is exhausted every call returns None.
    """
    global BZ2_BUFFER, BZ2_FILE
    if BZ2_BUFFER is None:
        return None

    # Keep feeding compressed chunks until a full line is buffered or the
    # input ends.  The decompressor may return no output for a chunk, so the
    # output length says nothing about end of input; only an empty read (or
    # the decompressor reporting end of stream) does.
    while b'\n' not in BZ2_BUFFER and BZ2_FILE is not None:
        bindata = BZ2_FILE.read(BZ2_READ_SIZE)
        if not bindata:
            BZ2_FILE = None
            break
        try:
            BZ2_BUFFER += BZ2_DECOMPRESSOR.decompress(bindata)
        except EOFError:
            # The compressed stream already ended; ignore trailing bytes.
            BZ2_FILE = None

    newline_at = BZ2_BUFFER.find(b'\n')
    if newline_at < 0:
        # Input is exhausted: hand out whatever is left, exactly once.
        tail, BZ2_BUFFER = BZ2_BUFFER, None
        return tail.decode(BZ2_ENCODING) if tail else None

    # Splitting on the b'\n' byte before decoding is safe for UTF-8, because
    # that byte never occurs inside a multi-byte sequence.
    line = BZ2_BUFFER[:newline_at + 1]
    BZ2_BUFFER = BZ2_BUFFER[newline_at + 1:]
    return line.decode(BZ2_ENCODING)
有什么想法吗?
答案 0 :(得分:5)
下面的写法更简洁一些,而且(在我看来)可读性更好,并且去掉了你的代码中所有令人讨厌的全局变量:
import bz2
import csv
from functools import partial
class BZ2_CSV_LineReader(object):
    """Stream CSV rows out of a bz2-compressed file, one chunk at a time.

    The file is read in ``buffer_size``-byte pieces and decompressed
    incrementally, so the whole file is never read in a single call.
    """

    def __init__(self, filename, buffer_size=4*1024):
        """Remember which file to read and how many bytes to read at once.

        Args:
            filename: Path of the bz2-compressed CSV file.
            buffer_size: Number of compressed bytes requested per read.
        """
        self.filename = filename
        self.buffer_size = buffer_size

    def readlines(self):
        """Yield each CSV record of the file as a list of strings."""
        with open(self.filename, 'rb') as file:
            for row in csv.reader(self._line_reader(file)):
                yield row

    def _line_reader(self, file):
        """Yield the decompressed UTF-8 text lines of the binary stream *file*.

        Every line keeps its trailing ``'\\n'``; a final line without one is
        still yielded.
        """
        # Buffer raw bytes and split on b'\n' before decoding.  This keeps
        # multi-byte UTF-8 characters intact when they straddle two chunks
        # (b'\n' never occurs inside a multi-byte sequence) and avoids
        # str.splitlines(), which also breaks on separators such as U+2028.
        pending = b''
        decompressor = bz2.BZ2Decompressor()
        read_chunk = partial(file.read, self.buffer_size)

        for bindata in iter(read_chunk, b''):
            pending += decompressor.decompress(bindata)
            if b'\n' not in pending:
                continue
            lines = pending.split(b'\n')
            # The last piece is the (possibly empty) start of the next line.
            pending = lines.pop()
            for raw_line in lines:
                yield (raw_line + b'\n').decode('utf-8')

        # Input ended without a closing newline: emit the remainder.
        if pending:
            yield pending.decode('utf-8')
if __name__ == '__main__':
    # Demo: print every CSV record found in the sample archive.
    bz2_csv_filename = 'test_csv.bz2'
    reader = BZ2_CSV_LineReader(bz2_csv_filename)
    for record in reader.readlines():
        print(record)
答案 1 :(得分:0)
也许有用:我使用Python 3,并且有一个很大的csv.bz2文件。 我是这样处理的:
import bz2
import csv
def bz2_csv_rows(fp, encoding=None):
    """Yield CSV rows from bz2-compressed data opened in text mode.

    Args:
        fp: Passed straight to ``bz2.open()``, which accepts either a file
            name or an existing binary file object.
        encoding: Text encoding of the decompressed data.  ``None`` (the
            default) keeps the platform's default encoding.

    Yields:
        One list of strings per CSV record.
    """
    # newline='' leaves line endings untouched so that csv.reader itself
    # handles line breaks inside quoted fields.
    with bz2.open(fp, mode='rt', encoding=encoding, newline='') as bzfp:
        yield from csv.reader(bzfp)
关键在于以文本模式打开流:调用 bz2.open() 时使用 mode='rt',而不是在二进制模式下手动搜索 "\n"。但是我不确定这是否适用于非物理文件。