我有一个带有两个数组的标准HDF5文件:一个是5000x5000的双精度数,另一个是20x300的整数。该文件很大,但没有那么大:
$ ls -lh
-rw-rw-r-- 1 . . 18M Jun 23 2014 trained.h5
我能够从python中的文件中获取数据,现在我想将此数据保存在独立于HDF5的文件中。我尝试了一种非常简单明了的方法:
from mrth import *
import h5py
import json, io, sys

# Read the two datasets from the HDF5 file given as the first CLI argument
# and re-save them in a simple self-describing format: one JSON metadata
# line, then for each array a JSON line with its serialized byte size
# followed by the raw .npy payload (written by `save`, presumably
# numpy.save re-exported from mrth -- TODO confirm).
print("\nreading weights and groups from the file %s" % sys.argv[1])
# Close the HDF5 handle deterministically; Dataset.value was removed in
# h5py 3.0, so read full arrays with [()] instead.
with h5py.File(sys.argv[1], "r") as data:
    all_weights = data["data"]["weights"][()]
    all_popmembers = data["data"]["popmembers"][()].T

print("weights")
print(" > type :", type(all_weights))
print(" > shape :", all_weights.shape)
print(all_weights)
print("popmembers")
print(" > type :", type(all_popmembers))
print(" > shape :", all_popmembers.shape)
print(all_popmembers)

# Network constants (fixed for this model).
Ncells, Ne, Ni = 5000, 4000, 1000
Npop, Nmaxmembers = all_popmembers.shape
pmembership = .05


def _append_array(fd, arr):
    """Serialize *arr* in memory with the .npy writer, then write its byte
    size as one JSON line followed by the raw payload to *fd* (binary)."""
    with io.BytesIO() as bfd:
        save(bfd, arr)
        fd.write(json.dumps(bfd.tell()).encode("ascii"))
        fd.write(b"\n")
        fd.write(bfd.getvalue())


print("\nsaving weights and groups to the file %s" % sys.argv[2])
# The file mixes a JSON text header with raw binary payloads, so it must be
# opened in binary mode -- text mode raises TypeError on Python 3 and
# corrupts the payload via newline translation on Windows.
with open(sys.argv[2], "wb") as fd:
    # Saving some context
    fd.write(json.dumps({
        "Ncells": Ncells,
        "Ne": Ne,
        "Ni": Ni,
        "Npop": Npop,
        "pmembership": pmembership,
        "Nmaxmembers": Nmaxmembers
    }).encode("ascii"))
    fd.write(b"\n")
    _append_array(fd, all_weights)
    _append_array(fd, all_popmembers)
一切正常,但以这种方式制作的数据文件大小却很大:
$$ python h5_to_BP.py trained.h5 trained.OU
reading weights and groups from the file trained.h5
weights
> type : <type 'numpy.ndarray'>
> shape : (5000, 5000)
[[ 0. 0. 0. ... 0. 48.68430328
0. ]
[ 0. 0. 0. ... 49.50580978 0.
0. ]
[ 1.81663287 1.80222368 0. ... 0. 0.
0. ]
...
[ 0. 0. 0. ... 0. 0.
0. ]
[ 1.27279222 0. 0. ... 0. 0.
16.22809982]
[ 0. 0. 0. ... 0. 0.
0. ]]
popmembers
> type : <type 'numpy.ndarray'>
> shape : (20, 300)
[[ 716 1866 3129 ... -1 -1 -1]
[1229 529 2725 ... -1 -1 -1]
[3971 1522 2328 ... -1 -1 -1]
...
[1161 46 3721 ... -1 -1 -1]
[1451 1712 3988 ... -1 -1 -1]
[3615 2657 3566 ... -1 -1 -1]]
saving weights and groups to the file trained.OU
$ ls -lh
-rw-rw-r-- 1 . . 18M Jun 23 2014 trained.h5
-rw-rw-r-- 1 . . 191M Jan 8 20:33 trained.OU
权重矩阵非常稀疏(绝大多数元素都是零),因此我修改了代码,只保存正的权重值及其对应的索引:
from mrth import *
import h5py
import json, io, sys

# Sparse variant of the converter: instead of dumping the dense 5000x5000
# weight matrix, store only the positive weights (and the valid, i.e.
# non-negative, population members) together with their (row, col) indices.
# On-disk layout: one JSON metadata line, then for each array a JSON line
# with its serialized byte size followed by the raw .npy payload.
print("\nreading weights and groups from the file %s" % sys.argv[1])
# Close the HDF5 handle deterministically; Dataset.value was removed in
# h5py 3.0, so read full arrays with [()] instead.
with h5py.File(sys.argv[1], "r") as data:
    all_weights = data["data"]["weights"][()]
    all_popmembers = data["data"]["popmembers"][()].T

print("weights")
print(" > type :", type(all_weights))
print(" > shape :", all_weights.shape)
print(all_weights)
print("popmembers")
print(" > type :", type(all_popmembers))
print(" > shape :", all_popmembers.shape)
print(all_popmembers)

# Network constants (fixed for this model).
Ncells, Ne, Ni = 5000, 4000, 1000
Npop, Nmaxmembers = all_popmembers.shape
pmembership = .05

# (row, col) index pairs of the entries worth keeping: positive weights,
# and population slots that hold a real cell id (-1 marks "empty").
x, y = where(all_weights > 0.)
p_pawid = dstack((x, y))[0]
x, y = where(all_popmembers >= 0.)
p_pmembers = dstack((x, y))[0]


def _append_array(fd, arr):
    """Serialize *arr* in memory with the .npy writer, then write its byte
    size as one JSON line followed by the raw payload to *fd* (binary)."""
    with io.BytesIO() as bfd:
        save(bfd, arr)
        fd.write(json.dumps(bfd.tell()).encode("ascii"))
        fd.write(b"\n")
        fd.write(bfd.getvalue())


print("\nsaving weights and groups to the file %s" % sys.argv[2])
# The file mixes a JSON text header with raw binary payloads, so it must be
# opened in binary mode -- text mode raises TypeError on Python 3 and
# corrupts the payload via newline translation on Windows.
with open(sys.argv[2], "wb") as fd:
    fd.write(json.dumps({
        "Ncells": Ncells,
        "Ne": Ne,
        "Ni": Ni,
        "Npop": Npop,
        "pmembership": pmembership,
        "Nmaxmembers": Nmaxmembers
    }).encode("ascii"))
    fd.write(b"\n")
    _append_array(fd, p_pawid)
    _append_array(fd, all_weights[p_pawid[:, 0], p_pawid[:, 1]])
    _append_array(fd, p_pmembers)
    _append_array(fd, all_popmembers[p_pmembers[:, 0], p_pmembers[:, 1]])
大小下降了,但仍然很大:
$ python h5_to_BP.py trained.h5 trained.BP
reading weights and groups from the file trained.h5
weights
> type : <type 'numpy.ndarray'>
> shape : (5000, 5000)
[[ 0. 0. 0. ... 0. 48.68430328
0. ]
[ 0. 0. 0. ... 49.50580978 0.
0. ]
[ 1.81663287 1.80222368 0. ... 0. 0.
0. ]
...
[ 0. 0. 0. ... 0. 0.
0. ]
[ 1.27279222 0. 0. ... 0. 0.
16.22809982]
[ 0. 0. 0. ... 0. 0.
0. ]]
popmembers
> type : <type 'numpy.ndarray'>
> shape : (20, 300)
[[ 716 1866 3129 ... -1 -1 -1]
[1229 529 2725 ... -1 -1 -1]
[3971 1522 2328 ... -1 -1 -1]
...
[1161 46 3721 ... -1 -1 -1]
[1451 1712 3988 ... -1 -1 -1]
[3615 2657 3566 ... -1 -1 -1]]
saving weights and groups to the file trained.BP
$ls -lh
-rw-rw-r-- 1 . . 18M Jun 23 2014 trained.h5
-rw-rw-r-- 1 . . 191M Jan 8 20:33 trained.OU
-rw-rw-r-- 1 . . 115M Jan 8 20:27 trained.BP
有人可以解释这种现象吗?同一份数据的体积竟然增大了约 10 倍!我猜问题出在 io.BytesIO 上,对吗?