How do I compress an hdf5 file when resizing?

Asked: 2016-07-02 00:39:09

Tags: python hdf5 h5py

Here is my code:

import h5py

n = 100000       # This is what makes it tricky - lots of files going into this hdf5 file

with h5py.File('image1.h5', 'w') as f:
    # Resizable, chunked, gzip-compressed datasets; axis 0 grows by one per image
    dset_X = f.create_dataset('X', (1, 960, 224, 224), maxshape=(None, 960, 224, 224),
                              chunks=True, compression='gzip')
    dset_y = f.create_dataset('y', (1, 112, 224*224), maxshape=(None, 112, 224*224),
                              chunks=True, compression='gzip')
    n_images = 0
    for fl in files[:n]:          # `files` and `get_arrays` are defined elsewhere
        X_chunk, y_chunk = get_arrays(fl)
        dset_X.resize(n_images + 1, axis=0)
        dset_y.resize(n_images + 1, axis=0)
        print(dset_X.shape, dset_y.shape)
        dset_X[n_images:n_images + 1, :, :, :] = X_chunk
        dset_y[n_images:n_images + 1, :, :] = y_chunk
        n_images += 1

This works fine and dandy. However, with 1 file the hdf5 is 6.7 MB. With 2 files it is 37 MB (shouldn't that be around 12 MB?). With 10 it goes all the way up to 388 MB (it should be 67, right?).

So clearly, explicitly adding the compression flag to the two create_dataset calls is not working as intended. How can I achieve this?
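HDF5 applies compression per chunk, so the chunk layout controls both the compression ratio and how much data gets rewritten as the dataset grows. One thing worth trying before switching libraries is to pass an explicit chunk shape whose first axis is 1, so each appended image lands in fresh chunks, instead of relying on the layout that chunks=True guesses from the initial 1-image shape. A minimal sketch, not a verified fix: the chunk shapes and gzip level here are assumptions, and files/get_arrays are stubbed with random data so it runs standalone.

import h5py
import numpy as np

# Hypothetical stubs standing in for the asker's helpers; shapes match the question.
files = ['a.png', 'b.png', 'c.png']
def get_arrays(fl):
    return (np.random.rand(1, 960, 224, 224),
            np.random.rand(1, 112, 224 * 224))

with h5py.File('image1.h5', 'w') as f:
    # Start with 0 rows and grow; chunk axis 0 is 1, so appending a new image
    # never rewrites an already-compressed chunk.
    dset_X = f.create_dataset('X', shape=(0, 960, 224, 224),
                              maxshape=(None, 960, 224, 224),
                              chunks=(1, 16, 224, 224),
                              compression='gzip', compression_opts=4)
    dset_y = f.create_dataset('y', shape=(0, 112, 224 * 224),
                              maxshape=(None, 112, 224 * 224),
                              chunks=(1, 8, 224 * 224),
                              compression='gzip', compression_opts=4)
    for i, fl in enumerate(files):
        X_chunk, y_chunk = get_arrays(fl)
        dset_X.resize(i + 1, axis=0)
        dset_y.resize(i + 1, axis=0)
        dset_X[i:i + 1] = X_chunk
        dset_y[i:i + 1] = y_chunk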

1 Answer:

Answer 0 (score: 0):

I ended up doing this successfully using pytables.

import numpy as np
from skimage import color, io

def get_arrays(each_file):
    # Convert the RGB image to Lab; the L channel is the input, a/b are the target
    lab = color.rgb2lab(io.imread(each_file))
    X = lab[:, :, :1]
    y = lab[:, :, 1:]
    X_rows, X_columns, X_channels = X.shape
    y_rows, y_columns, y_channels = y.shape
    # Channels-first layout, tiled to 3 channels and rescaled to [0, 255]
    X_channels_first = np.transpose(X, (2, 0, 1))
    X_sample = np.expand_dims(X_channels_first, axis=0)
    X_3d = np.tile(X_sample, (1, 3, 1, 1))
    X_3d_scaled = X_3d * 255.0 / X_3d.max()
    # `model`, `extract_hypercolumn` and `KNN` are defined elsewhere
    hc = extract_hypercolumn(model, [3, 8, 15, 22], X_3d_scaled)
    hc_scaled = (hc - hc.min()) / (hc.max() - hc.min())
    print(hc_scaled.max(), hc_scaled.min())
    hc_expand_dims = np.expand_dims(hc_scaled, axis=0)
    # Classify each pixel's a/b value into color classes
    y_reshaped = np.reshape(y, (y_rows * y_columns, y_channels))
    classed_pixels_first = KNN.predict_proba(y_reshaped)
    classed_classes_first = np.transpose(classed_pixels_first, (1, 0))
    classed_expand_dims = np.expand_dims(classed_classes_first, axis=0)
    print("hypercolumn shape:", hc_expand_dims.shape,
          "classified output color shape:", classed_expand_dims.shape)
    return hc_expand_dims, classed_expand_dims

import numpy as np
import tables

# zlib at level 5; blosc is a faster alternative:
#   filters = tables.Filters(complib='blosc', complevel=5)
filters = tables.Filters(complevel=5, complib='zlib')

with tables.open_file('raw.h5', 'w') as f:
    # EArrays start with a 0-length first axis and are extended by append();
    # the filters argument applies the compression to every chunk.
    dset_X = f.create_earray(f.root, 'X', tables.Atom.from_dtype(np.dtype('float64')),
                             (0, 960, 224, 224), filters=filters)
    dset_y = f.create_earray(f.root, 'y', tables.Atom.from_dtype(np.dtype('float64')),
                             (0, 112, 224 * 224), filters=filters)
    for fl in files[0:12000]:
        X_chunk, y_chunk = get_arrays(fl)
        dset_X.append(X_chunk)
        dset_y.append(y_chunk)
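To confirm the filters actually took effect, the resulting file can be inspected after writing. A small sketch, assuming the raw.h5 produced above; size_in_memory and size_on_disk are attributes PyTables exposes on every leaf node:

import os
import tables

print("file size on disk: %.1f MB" % (os.path.getsize('raw.h5') / 1e6))

with tables.open_file('raw.h5', 'r') as f:
    for node in (f.root.X, f.root.y):
        # in-memory vs on-disk byte counts give the effective compression ratio
        ratio = node.size_in_memory / max(node.size_on_disk, 1)
        print(node.name, node.shape, "compression ratio: %.1fx" % ratio)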