这是我的代码:
# Upper bound on how many input files are packed into the single HDF5 file;
# the large count is what makes the resulting file size a concern.
n = 100000

with h5py.File('image1.h5', 'w') as f:
    # Both datasets start with one row and are unlimited along axis 0
    # (maxshape=None there), so they can grow by one sample per input file.
    # chunks=True leaves the chunk shape to h5py; gzip is the compression
    # filter requested for both datasets.
    dset_X = f.create_dataset('X', (1, 960, 224, 224),
                              maxshape=(None, 960, 224, 224),
                              chunks=True, compression='gzip')
    dset_y = f.create_dataset('y', (1, 112, 224 * 224),
                              maxshape=(None, 112, 224 * 224),
                              chunks=True, compression='gzip')

    # Number of samples written so far; also the row index of the next one.
    n_images = 0
    for fl in files[:n]:
        X_chunk, y_chunk = get_arrays(fl)

        # Grow each dataset to hold exactly one more sample before filling it.
        dset_X.resize(n_images + 1, axis=0)
        dset_y.resize(n_images + 1, axis=0)

        # A single pre-formatted argument prints the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("%s %s" % (dset_X.shape, dset_y.shape))

        dset_X[n_images:n_images + 1, :, :, :] = X_chunk
        dset_y[n_images:n_images + 1, :, :] = y_chunk
        n_images += 1
这段代码运行得很好。但是,只写入1个文件时,hdf5文件的大小为6.7MB;写入2个文件时,它变成了37MB(不是应该约12MB吗?);写入10个文件时一直涨到388MB(应该约67MB,对吗?)。
因此,很明显,在两条 create_dataset 语句的末尾添加压缩参数(compression='gzip')并没有按预期工作。我怎样才能让文件大小随文件数量大致按比例增长?
答案 0 :(得分:0)
我最终使用pytables成功完成了这项工作。
def get_arrays(each_file):
    """Build one (input, target) sample pair from an image file.

    The image is read with ``io.imread`` and converted from RGB to Lab.
    The first Lab channel is moved to channels-first layout, tiled to three
    channels, rescaled so that its maximum is 255 and passed to
    ``extract_hypercolumn``; that result is min-max normalised.  The
    remaining Lab channels are flattened to one row per pixel and passed to
    ``KNN.predict_proba``.

    Relies on the module-level names ``color``, ``io``, ``np``, ``model``,
    ``KNN`` and ``extract_hypercolumn``.

    Args:
        each_file: Image location, handed straight to ``io.imread``.

    Returns:
        A ``(hc_expand_dims, classed_expand_dims)`` tuple.  The first item is
        the normalised hypercolumn with a new leading axis of length 1.  The
        second is the ``predict_proba`` output with its two axes swapped
        (presumably giving classes x pixels -- confirm against the
        classifier) and a new leading axis of length 1.
    """
    lab = color.rgb2lab(io.imread(each_file))
    X = lab[:, :, :1]   # first Lab channel, kept as a 3-D array
    y = lab[:, :, 1:]   # remaining Lab channels
    y_rows, y_columns, y_channels = y.shape

    # (rows, cols, 1) -> (1, rows, cols) -> (1, 1, rows, cols) -> (1, 3, rows, cols)
    X_channels_first = np.transpose(X, (2, 0, 1))
    X_sample = np.expand_dims(X_channels_first, axis=0)
    X_3d = np.tile(X_sample, (1, 3, 1, 1))

    # Rescale so the maximum becomes 255.  A zero maximum would divide by
    # zero and yield NaN/inf, so such input is passed through unscaled.
    X_peak = X_3d.max()
    X_3d_scaled = X_3d * 255.0 / X_peak if X_peak else X_3d

    hc = extract_hypercolumn(model, [3, 8, 15, 22], X_3d_scaled)

    # Min-max normalise.  A constant hypercolumn has zero span; the shifted
    # values are then all zero already, so the division is skipped.
    hc_shifted = hc - hc.min()
    hc_span = hc.max() - hc.min()
    hc_scaled = hc_shifted / hc_span if hc_span else hc_shifted

    # Single pre-formatted argument: same output on Python 2 and Python 3.
    print("%s %s" % (hc_scaled.max(), hc_scaled.min()))
    hc_expand_dims = np.expand_dims(hc_scaled, axis=0)

    # One row per pixel, one column per remaining Lab channel.
    y_reshaped = np.reshape(y, (y_rows * y_columns, y_channels))
    classed_pixels_first = KNN.predict_proba(y_reshaped)
    classed_classes_first = np.transpose(classed_pixels_first, (1, 0))
    classed_expand_dims = np.expand_dims(classed_classes_first, axis=0)

    # Joined with single spaces to reproduce the Python 2 print-statement
    # output exactly while remaining valid on Python 3.
    print(" ".join(["hypercolumn shape: ", str(hc_expand_dims.shape),
                    "classified output color shape: ", str(classed_expand_dims.shape)]))
    return hc_expand_dims, classed_expand_dims
# Compression settings shared by both arrays: zlib at level 5.
# Alternative compressor: tables.Filters(complib='blosc', complevel=5)
filters = tables.Filters(complevel=5, complib='zlib')

# open_file is the PEP 8 spelling that matches create_earray below; the
# legacy camelCase openFile is not available in newer PyTables releases.
with tables.open_file('raw.h5', 'w') as f:
    # Lower-case 'float64' is the canonical NumPy dtype name; the capitalised
    # 'Float64' alias is rejected by newer NumPy releases.
    float64 = np.dtype('float64')

    # A leading dimension of 0 marks the axis along which the EArray is
    # extended by append().
    dset_X = f.create_earray(f.root, 'X', tables.Atom.from_dtype(float64),
                             (0, 960, 224, 224), filters=filters)
    dset_y = f.create_earray(f.root, 'y', tables.Atom.from_dtype(float64),
                             (0, 112, 224 * 224), filters=filters)

    for fl in files[0:12000]:
        X_chunk, y_chunk = get_arrays(fl)
        # Each chunk carries a leading axis of length 1, so every append adds
        # one row to the corresponding array.
        dset_X.append(X_chunk)
        dset_y.append(y_chunk)