I'm writing a script that processes a (very large) file by repeatedly unpickling objects from it until EOF. I'd like to partition the file and have separate processes (in the cloud) unpickle and process separate portions.

However, my partitioner isn't intelligent: it has no way of knowing about the boundaries between pickled objects in the file (since those boundaries depend on the object types being pickled, etc.).

Is there a way to scan a file for a "start of pickled object" sentinel? The naive way would be to attempt unpickling at successive byte offsets until an object is successfully read, but that yields unexpected errors. It seems that for certain combinations of input, the unpickler falls out of sync and returns nothing for the rest of the file (see the code below).
import cPickle
import os

def stream_unpickle(file_obj):
    while True:
        start_pos = file_obj.tell()
        try:
            yield cPickle.load(file_obj)
        except (EOFError, KeyboardInterrupt):
            break
        except (cPickle.UnpicklingError, ValueError, KeyError, TypeError, ImportError):
            file_obj.seek(start_pos + 1, os.SEEK_SET)
if __name__ == '__main__':
    import random
    from StringIO import StringIO

    # create some data
    sio = StringIO()
    [cPickle.dump(random.random(), sio, cPickle.HIGHEST_PROTOCOL) for _ in xrange(1000)]
    sio.flush()

    # read from subsequent offsets and find discontinuous jumps in object count
    size = sio.tell()
    last_count = None
    for step in xrange(size):
        sio.seek(step, os.SEEK_SET)
        count = sum(1 for _ in stream_unpickle(sio))
        if last_count is None or count == last_count - 1:
            last_count = count
        elif count != last_count:
            # if successful, these should never print (but they do...)
            print '%d elements read from byte %d' % (count, step)
            print '(%d elements read from byte %d)' % (last_count, step - 1)
            last_count = count
Answer 0 (score: 1)
The pickletools module has a dis function that shows the opcodes. It suggests you could scan for a STOP opcode:
>>> import pickle, pickletools, StringIO
>>> s = StringIO.StringIO()
>>> pickle.dump('abc', s)
>>> p = s.getvalue()
>>> pickletools.dis(p)
    0: S    STRING     'abc'
    7: p    PUT        0
   10: .    STOP
highest protocol among opcodes = 0
Note that using the STOP opcode is a bit tricky because the opcodes are variable length, but it may serve as a useful hint about where the cut-off points are.
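For example, here is a minimal sketch along those lines using pickletools.genops, which parses the opcodes of exactly one pickle and stops after its STOP opcode; pickle_offsets is a hypothetical helper name, and the sketch assumes the scan starts exactly on a pickle boundary:

import cPickle, pickletools
from StringIO import StringIO

def pickle_offsets(f):
    # genops parses opcodes for exactly one pickle, leaving the
    # file positioned just past its STOP opcode
    while True:
        start = f.tell()
        try:
            for opcode, arg, pos in pickletools.genops(f):
                pass
        except ValueError:  # raised once the stream is exhausted
            return
        yield start, f.tell()

sio = StringIO()
for obj in ('abc', [10, 20], 'def'):
    cPickle.dump(obj, sio, cPickle.HIGHEST_PROTOCOL)
sio.seek(0)
for start, end in pickle_offsets(sio):
    print 'pickle spans bytes [%d, %d)' % (start, end)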
If you control the pickling step on the other end, then you can improve the situation by adding your own unambiguous separator:
>>> sep = '\xDE\xAD\xBE\xEF'
>>> s = StringIO.StringIO()
>>> pickle.dump('abc', s)
>>> s.write(sep)
>>> pickle.dump([10, 20], s)
>>> s.write(sep)
>>> pickle.dump('def', s)
>>> s.write(sep)
>>> pickle.dump([30, 40], s)
>>> p = s.getvalue()
Before unpickling, split into individual pickles using the known separator:
>>> for pick in p.split(sep):
...     print pickle.loads(pick)
abc
[10, 20]
def
[30, 40]
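For very large files, the same idea can be applied incrementally so the whole stream never has to sit in memory at once. This is a minimal sketch, not part of the answer above; split_records is a hypothetical helper, and it assumes the separator never appears inside the pickled bytes themselves (the asker's own answer below handles that case with escaping):

def split_records(f, sep='\xDE\xAD\xBE\xEF', chunk_size=1 << 20):
    buf = ''
    while True:
        chunk = f.read(chunk_size)
        if not chunk:
            if buf:
                yield buf  # final record has no trailing separator
            return
        buf += chunk
        parts = buf.split(sep)
        buf = parts.pop()  # keep the tail: it may be an incomplete record or a partial separator
        for part in parts:
            yield part

Usage would then be something like: for raw in split_records(open('big.pkl', 'rb')): obj = pickle.loads(raw).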
Answer 1 (score: 0)
In a pickled file, some opcodes have an argument: a data value that follows the opcode. The data values vary in length and can contain bytes identical to opcodes. Therefore, if you start reading the file from an arbitrary position, you have no way of knowing whether you are looking at an opcode or at the middle of an argument. You have to read the file from the beginning and parse the opcodes.
I wrote this function that skips one pickle from a file, i.e. reads it and parses the opcodes, but does not construct the objects. It seems slightly faster than cPickle.loads on some of my files. You could rewrite this in C for even more speed (after testing it properly). You could then make a single pass over the whole file to get the seek position of each pickle.
from pickletools import code2op, UP_TO_NEWLINE, TAKEN_FROM_ARGUMENT1, TAKEN_FROM_ARGUMENT4
from marshal import loads as mloads

def skip_pickle(f):
    """Skip one pickle from file.

    'f' is a file-like object containing the pickle.
    """
    while True:
        code = f.read(1)
        if not code:
            raise EOFError
        opcode = code2op[code]
        if opcode.arg is not None:
            # skip the opcode's argument, whose length is either fixed,
            # newline-terminated, or given by a length prefix
            n = opcode.arg.n
            if n > 0:
                f.read(n)
            elif n == UP_TO_NEWLINE:
                f.readline()
            elif n == TAKEN_FROM_ARGUMENT1:
                n = ord(f.read(1))  # 1-byte length prefix
                f.read(n)
            elif n == TAKEN_FROM_ARGUMENT4:
                n = mloads('i' + f.read(4))  # 4-byte little-endian length prefix
                f.read(n)
        if code == '.':  # STOP opcode ends the pickle
            break
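A minimal sketch of the single indexing pass described above (index_pickles is a hypothetical helper): it records the start offset of every pickle so that workers can later seek directly to their assigned portion of the file:

def index_pickles(f):
    offsets = []
    while True:
        pos = f.tell()
        try:
            skip_pickle(f)
        except EOFError:  # clean end of stream
            break
        offsets.append(pos)
    return offsets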
Answer 2 (score: 0)
Sorry to answer my own question, and thanks to @RaymondHettinger for the idea of adding sentinels.

Here's what worked for me. I created readers and writers that use a sentinel '#S' followed by a data block length at the beginning of each 'record'. The writer has to take care to find any occurrences of '#' in the data being written and double them (into '##'). The reader then uses a look-behind regular expression to find sentinels, distinct from any similar values that may appear in the original stream, and to verify the number of bytes between this sentinel and the subsequent one.

RecordWriter is a context manager (so multiple calls to write() can be encapsulated into a single record if desired). RecordReader is a generator.

Not sure how well this performs. Any faster/more elegant solutions are welcome.
import os
import re
import cPickle
from functools import partial
from cStringIO import StringIO

SENTINEL = '#S'

# when scanning, look for #S, but NOT ##S
sentinel_pattern = '(?<!#)#S'  # uses negative look-behind
sentinel_re = re.compile(sentinel_pattern)
find_sentinel = sentinel_re.search

# when writing, replace each single # with a double ##
write_pattern = '#'
write_re = re.compile(write_pattern)
fix_write = partial(write_re.sub, '##')

# when reading, replace each double ## with a single #
read_pattern = '##'
read_re = re.compile(read_pattern)
fix_read = partial(read_re.sub, '#')
class RecordWriter(object):
    def __init__(self, stream):
        self._stream = stream
        self._write_buffer = None

    def __enter__(self):
        self._write_buffer = StringIO()
        return self

    def __exit__(self, et, ex, tb):
        if self._write_buffer.tell():
            self._stream.write(SENTINEL)  # start marker
            # byte length of the user's original (unescaped) data
            cPickle.dump(self._write_buffer.tell(), self._stream, cPickle.HIGHEST_PROTOCOL)
            self._stream.write(fix_write(self._write_buffer.getvalue()))
        self._write_buffer = None
        return False

    def write(self, data):
        if not self._write_buffer:
            raise ValueError("Must use RecordWriter as a context manager")
        self._write_buffer.write(data)
class BadBlock(Exception): pass

def verify_length(block):
    fobj = StringIO(block)
    try:
        stated_length = cPickle.load(fobj)
    except (ValueError, IndexError, cPickle.UnpicklingError):
        raise BadBlock
    data = fobj.read()
    if len(data) != stated_length:
        raise BadBlock
    return data
def RecordReader(stream):
    'Yield verified records from the stream'
    accum = StringIO()
    seen_sentinel = False
    data = ''
    while True:
        m = find_sentinel(data)
        if not m:
            if seen_sentinel:
                accum.write(data)
            data = stream.read(80)
            if not data:
                # EOF: emit whatever has accumulated, if it verifies
                if accum.tell():
                    try: yield verify_length(fix_read(accum.getvalue()))
                    except BadBlock: pass
                return
        else:
            if seen_sentinel:
                accum.write(data[:m.start()])
                try: yield verify_length(fix_read(accum.getvalue()))
                except BadBlock: pass
                accum = StringIO()
            else:
                seen_sentinel = True
            data = data[m.end():]  # toss everything up to and including the sentinel
if __name__ == '__main__':
    import random
    stream = StringIO()
    data = [str(random.random()) for _ in xrange(3)]
    # test with a string containing sentinel and almost-sentinel
    data.append('abc12#jeoht38#SoSooihetS#')
    count = len(data)
    for i in data:
        with RecordWriter(stream) as r:
            r.write(i)
    size = stream.tell()
    start_pos = int(random.random() * size)
    stream.seek(start_pos, os.SEEK_SET)
    read_data = [s for s in RecordReader(stream)]
    print 'Original data: ', data
    print 'After seeking to %d, RecordReader returned: %s' % (start_pos, read_data)