
时间:2014-10-16 01:22:56

标签: python sql out-of-memory pickle

我遇到的问题是:我有一个非常大的pickle文件(2.6 GB),每次尝试打开它都会出现内存错误(MemoryError)。我现在意识到本应使用数据库来存储所有信息,但为时已晚。该pickle文件包含从互联网上抓取的美国国会记录的日期和文本(抓取过程运行了大约2周)。有没有办法以增量方式访问我转储到pickle文件中的信息,或者把pickle文件转换为SQL数据库,或转换为其他我可以打开的格式,而不必重新录入所有数据?我真的不想再花2周时间重新抓取国会记录并把数据导入数据库。




def save_objects(objects):
    """Write *objects* to ``objects.pkl``, replacing any previous contents.

    The data is serialized with the highest pickle protocol available.
    """
    sink = open('objects.pkl', 'wb')
    try:
        pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL).dump(objects)
    finally:
        sink.close()

def Main():   
    # Scraping loop from the question (excerpt): walks the lines of
    # links2.txt and echoes each one.
    # NOTE(review): the snippet is cut off -- `text` is never assigned in the
    # visible code and the final `if` has no body; confirm against the full
    # script before relying on it.
    file = open("datafile.txt", "w")  # opened for writing; not closed in the visible code
    with open('links2.txt', 'rb') as infile:
        for link in infile: 
            print link
            if text != None:


def Main():
    """Load the whole pickle stored in ``objects1.pkl`` into memory.

    The unpickled value is only bound to a local name; nothing is returned.
    """
    # NOTE: pickle.load rebuilds the entire object graph in one go (this is
    # the call that runs out of memory on very large files) and must only be
    # used on files from a trusted source.
    with open('objects1.pkl', 'rb') as pickle_file:  # closed even if load fails
        loaded = pickle.load(pickle_file)

看起来你陷入了一点困境(英文双关:in a bit of a pickle)! ;-)。希望在此之后,你再也不会使用pickle了。它不是一种很好的数据存储格式。


class Document(object):  # NOTE: keep the explicit `object` base -- without it the pickle format is different!
    """A single record made of a title, a date and a body text (all assumed to be strings)."""

    def __init__(self, title, date, text):
        # Plain instance attributes, stored in the instance __dict__.
        self.title, self.date, self.text = title, date, text


# Three small sample documents used as the data to be pickled.
d = [
    Document(title='foo', date='1/1/1', text='foo is good'),
    Document(title='bar', date='2/2/2', text='bar is better'),
    Document(title='baz', date='3/3/3', text='no one likes baz :('),
]

使用协议2(即Python 2.x的pickle.HIGHEST_PROTOCOL)对它进行pickle序列化:

>>> s = pickle.dumps(d, 2)
>>> s
'\x80\x02]q\x00(c__main__\nDocument\nq\x01)\x81q\x02}q\x03(U\x04dateq\x04U\x051/1/1q\x05U\x04textq\x06U\x0bfoo is goodq\x07U\x05titleq\x08U\x03fooq\tubh\x01)\x81q\n}q\x0b(h\x04U\x052/2/2q\x0ch\x06U\rbar is betterq\rh\x08U\x03barq\x0eubh\x01)\x81q\x0f}q\x10(h\x04U\x053/3/3q\x11h\x06U\x13no one likes baz :(q\x12h\x08U\x03bazq\x13ube.'


>>> pickletools.dis(s)
    0: \x80 PROTO      2
    2: ]    EMPTY_LIST
    3: q    BINPUT     0
    5: (    MARK
    6: c        GLOBAL     '__main__ Document'
   25: q        BINPUT     1
   27: )        EMPTY_TUPLE
   28: \x81     NEWOBJ
   29: q        BINPUT     2
   31: }        EMPTY_DICT
   32: q        BINPUT     3
   34: (        MARK
   35: U            SHORT_BINSTRING 'date'
   41: q            BINPUT     4
   43: U            SHORT_BINSTRING '1/1/1'
   50: q            BINPUT     5
   52: U            SHORT_BINSTRING 'text'
   58: q            BINPUT     6
   60: U            SHORT_BINSTRING 'foo is good'
   73: q            BINPUT     7
   75: U            SHORT_BINSTRING 'title'
   82: q            BINPUT     8
   84: U            SHORT_BINSTRING 'foo'
   89: q            BINPUT     9
   91: u            SETITEMS   (MARK at 34)
   92: b        BUILD
   93: h        BINGET     1
   95: )        EMPTY_TUPLE
   96: \x81     NEWOBJ
   97: q        BINPUT     10
   99: }        EMPTY_DICT
  100: q        BINPUT     11
  102: (        MARK
  103: h            BINGET     4
  105: U            SHORT_BINSTRING '2/2/2'
  112: q            BINPUT     12
  114: h            BINGET     6
  116: U            SHORT_BINSTRING 'bar is better'
  131: q            BINPUT     13
  133: h            BINGET     8
  135: U            SHORT_BINSTRING 'bar'
  140: q            BINPUT     14
  142: u            SETITEMS   (MARK at 102)
  143: b        BUILD
  144: h        BINGET     1
  146: )        EMPTY_TUPLE
  147: \x81     NEWOBJ
  148: q        BINPUT     15
  150: }        EMPTY_DICT
  151: q        BINPUT     16
  153: (        MARK
  154: h            BINGET     4
  156: U            SHORT_BINSTRING '3/3/3'
  163: q            BINPUT     17
  165: h            BINGET     6
  167: U            SHORT_BINSTRING 'no one likes baz :('
  188: q            BINPUT     18
  190: h            BINGET     8
  192: U            SHORT_BINSTRING 'baz'
  197: q            BINPUT     19
  199: u            SETITEMS   (MARK at 153)
  200: b        BUILD
  201: e        APPENDS    (MARK at 5)
  202: .    STOP

看起来很复杂!但实际上,它并没有那么糟糕。 pickle基本上是一个堆栈机器,你看到的每个ALL_CAPS标识符都是操作码,它以某种方式操作内部“堆栈”进行解码。如果我们试图解析一些复杂的结构,这将更为重要,但幸运的是,我们只是制作一个基本元组的简单列表。所有这些“代码”正在做的是在堆栈上构造一堆对象,然后将整个堆栈推送到列表中。

我们需要关注的一件事是散落在各处的'BINPUT'/'BINGET'操作码。基本上,它们用于“memoization”(记忆化)以减少数据体积:pickle用BINPUT <id>保存字符串,如果该字符串再次出现,就不再重新转储,而只是添加一个BINGET <id>从缓存中取回它。

另外,还有一个复杂之处!不只有SHORT_BINSTRING——对于长度超过255字节(无法用单字节长度表示)的字符串还有普通的BINSTRING,以及一些有趣的unicode变体。我只是假设你使用的是Python 2,并且所有字符串都是ASCII。同样,如果这个假设不正确,请评论说明。

好的,所以我们需要流式读取文件,直到遇到'\x81'字节(NEWOBJ)。然后向前扫描,直到遇到'('(MARK)字符。接着,在遇到'u'(SETITEMS)之前,我们读取成对的键/值字符串——总共应该有3对,每个字段一对。




pickledata = '\x80\x02]q\x00(c__main__\nDocument\nq\x01)\x81q\x02}q\x03(U\x04dateq\x04U\x051/1/1q\x05U\x04textq\x06U\x0bfoo is goodq\x07U\x05titleq\x08U\x03fooq\tubh\x01)\x81q\n}q\x0b(h\x04U\x052/2/2q\x0ch\x06T\x14\x05\x00\x00bar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterbar is betterq\rh\x08U\x03barq\x0eubh\x01)\x81q\x0f}q\x10(h\x04U\x053/3/3q\x11h\x06U\x13no one likes baz :(q\x12h\x08U\x03bazq\x13ube.'

# Simulate a file here: wrap the sample pickle string in an in-memory,
# file-like object so it can be consumed with read() calls like a real file.
import StringIO
picklefile = StringIO.StringIO(pickledata)

import pickle # just for opcode names
import struct # binary unpacking

def try_memo(f, v, cache):
    """Consume an optional memo "put" opcode that follows a just-read value.

    Args:
        f: Seekable binary stream positioned right after the value's payload.
        v: The value that was just read from the stream.
        cache: Dict mapping raw 1-byte memo ids to values; updated in place.

    If the next opcode is BINPUT, ``v`` is stored under the 1-byte id that
    follows it.  LONG_BINPUT is skipped (opcode plus its 4-byte id) without
    storing anything, to keep the cache small.  Any other opcode is left
    unread so the caller sees it next.
    """
    opcode = f.read(1)
    if not opcode:
        # End of stream: nothing was consumed, so there is nothing to rewind.
        return
    if opcode == pickle.BINPUT:
        # Keyed by the raw id byte, which is what a BINGET lookup reads too.
        cache[f.read(1)] = v
    elif opcode == pickle.LONG_BINPUT:
        print('skipping LONG_BINPUT to save memory, LONG_BINGET will probably not be used')
        f.read(4)  # discard the 4-byte memo id so the stream stays aligned
    else:
        # Not a memo opcode: un-read it.
        f.seek(f.tell() - 1)  # rewind

def try_read_string(f, opcode, cache):
    """Decode one string operand whose opcode has already been read.

    Args:
        f: Binary stream positioned right after ``opcode``.
        opcode: The 1-byte pickle opcode that introduced the operand.
        cache: Memo dict (raw 1-byte id -> string) shared across calls.

    Returns:
        The string payload (SHORT_BINSTRING / BINSTRING) or the previously
        memoized string (BINGET).

    Raises:
        KeyError: a BINGET refers to an id that was never memoized.
        Exception: LONG_BINGET or any other unsupported opcode.
    """
    if opcode in (pickle.SHORT_BINSTRING, pickle.BINSTRING):
        # SHORT_BINSTRING carries an unsigned 1-byte length; BINSTRING a
        # little-endian signed 4-byte length.
        length_type = 'B' if opcode == pickle.SHORT_BINSTRING else '<i'
        str_length = struct.unpack(length_type, f.read(struct.calcsize(length_type)))[0]
        value = f.read(str_length)
        # Use the cache that was passed in, not a module-level global.
        try_memo(f, value, cache)
        return value
    elif opcode == pickle.BINGET:
        return cache[f.read(1)]
    elif opcode == pickle.LONG_BINGET:
        # LONG_BINPUT entries are never memoized, so this cannot be resolved.
        raise Exception('Unexpected LONG_BINGET? Key ' + repr(f.read(4)))
    else:
        raise Exception('Invalid opcode ' + repr(opcode) + ' at pos ' + str(f.tell()))

# Stream the pickle one opcode at a time and rebuild each object's fields as
# a plain dict, so only one document is held in memory at any moment.
memo_cache = {}
while True:
    c = picklefile.read(1)
    if not c or c == pickle.STOP:
        # STOP terminates the pickle; an empty read means the stream ended
        # without one -- either way there is nothing left to scan.
        break
    if c != pickle.NEWOBJ:
        continue
    # Scan forward to the MARK that opens this object's key/value pairs.
    marker = picklefile.read(1)
    while marker and marker != pickle.MARK:
        marker = picklefile.read(1)
    if not marker:
        break  # stream ended before the field list started
    fields = {}
    while True:
        opcode = picklefile.read(1)
        if opcode == pickle.SETITEMS:
            break  # all key/value pairs of this object have been read
        key = try_read_string(picklefile, opcode, memo_cache)
        value = try_read_string(picklefile, picklefile.read(1), memo_cache)
        fields[key] = value
    # Single-argument form prints the same text on Python 2 and Python 3.
    print('Document %r' % (fields,))
    # insert to sqllite

def save_objects(objects):
    """Append one pickle of *objects* to ``objects.pkl``.

    Each call adds a separate pickle record at the end of the file, written
    with the highest pickle protocol available.
    """
    # Note: `ab` appends the data
    sink = open('objects.pkl', 'ab')
    try:
        pickle.Pickler(sink, pickle.HIGHEST_PROTOCOL).dump(objects)
    finally:
        sink.close()

def Main():
    # Revised scraping loop (excerpt): iterates over the lines of links2.txt.
    # NOTE(review): the loop body is not shown in this snippet -- confirm
    # against the full script.
    #objects=[] <-- lose the objects list
    with open('links2.txt', 'rb') as infile:
        for link in infile: 


import pickle
# Read back the records one at a time: every pickle.load call consumes exactly
# one appended pickle, so only a single article is in memory at once.
with open('objects.pkl', 'rb') as pickle_file:
    while True:
        try:
            article = pickle.load(pickle_file)
        except EOFError:
            # Raised when no further pickle is left in the file.
            break
        print(article)


  • 尝试cPickle。这可能有所帮助。
  • 尝试使用streaming-pickle(以流式方式读写pickle的库)
  • 在具有大量RAM的64位环境中读取您的pickle文件
  • 重新抓取原始数据,这次实际上是以递增方式存储数据,或将其存储在数据库中。 如果没有不断重写您的pickle输出文件的低效率,这次您的爬行可能会明显加快。

我最近遇到了一个非常相似的情况——一个11 GB的pickle文件。我没有尝试在自己的机器上以增量方式加载它,因为我没有足够的时间去实现自己的增量加载器,或针对我的情况改造现有的加载器。

