## Versions
print("Versions", "____________", sys.version.replace("\n","\t"), pd.__version__, sep="\n")
#Versions
#____________
#3.6.2 |Anaconda custom (x86_64)| (default, Jul 20 2017, 13:14:59) #[GCC 4.2.1 Compatible Apple LLVM 6.0 (clang-600.0.57)]
#0.21.1
# How I wrote the object
def write_object(obj, path, compression="bz2", protocol=pickle.HIGHEST_PROTOCOL, **args):
"""
Extensions:
pickle ==> .pkl
gzipped-pickle ==> .pgz
bzip2-pickle ==> .pbz2
"""
if compression is not None:
if compression == "bz2":
f = bz2.BZ2File(path, "wb")
if compression == "gzip":
f = gzip.GzipFile(path, "wb")
else:
f = open(path, "wb")
pickle.dump(obj, f, protocol=protocol, **args)
f.close()
# How I'm trying to read it back in
path = "./Data/objects/counts.tsv.pbz2"
write_object(df, path) #df = pd.DataFrame of ints
f = bz2.open(path, "rb")
df_loaded = pickle.load(f)
f.close()
# ---------------------------------------------------------------------------
# OSError Traceback (most recent call last)
# <ipython-input-13-04a37c035163> in <module>()
# 1 path = "./Data/objects/counts.tsv.pbz2"
# 2 f = bz2.open(path, "rb")
# ----> 3 pickle.load(f)
# ~/anaconda/lib/python3.6/bz2.py in peek(self, n)
# 170 # always returns at least one byte (except at EOF), independent
# 171 # of the value of n
# --> 172 return self._buffer.peek(n)
# 173
# 174 def read(self, size=-1):
# ~/anaconda/lib/python3.6/_compression.py in readinto(self, b)
# 66 def readinto(self, b):
# 67 with memoryview(b) as view, view.cast("B") as byte_view:
# ---> 68 data = self.read(len(byte_view))
# 69 byte_view[:len(data)] = data
# 70 return len(data)
# ~/anaconda/lib/python3.6/_compression.py in read(self, size)
# 101 else:
# 102 rawblock = b""
# --> 103 data = self._decompressor.decompress(rawblock, size)
# 104 if data:
# 105 break
# OSError: Invalid data stream
该文件是285.5 MB
制表符分隔的整数表,其中包含~1.5M columns
和~100 rows
。 gzipped
文件大小为42.4 MB
,bz2-zipped pickled pd.DataFrame
为34.2 MB
。加载到pd.DataFrame
需要45分钟,这就是我序列化的原因。
以这种方式腌制对象的大小是否有限制?
我问的原因是我以完全相同的方式处理的其他计数表是完美打开的。文件大小为unprocessed .tsv = 148.9 MB
,gzipped = 24.8 MB
和bz2-zipped pickled .pbz2 = 19.8 MB
。
唯一类似的问题,但答案没有帮助: Python BZ2 IOError: invalid data stream