我试图以与mnist.pkl相同的格式制作图像数据集
我使用https://github.com/dmitriy-serdyuk/cats_vs_dogs/blob/master/cats_vs_dogs/make_dataset.py作为参考。
这是我到目前为止所拥有的
# Root directory that is walked recursively for the .jpg image dataset.
path = '/home/dell/thesis/neon/Images'
def PIL2array(img):
    """Convert a PIL image to a uint8 numpy array of shape (height, width, bands).

    ``img.size`` is (width, height), so the axes are swapped in the reshape.
    The trailing axis is inferred (-1) instead of hard-coded to 1, so both
    single-band images (grayscale -> bands == 1) and multi-band images
    (RGB -> bands == 3) convert without a reshape error.
    """
    return numpy.array(img.getdata(),
                       numpy.uint8).reshape(img.size[1], img.size[0], -1)
def main():
fileList = [os.path.join(dirpath, f)
for dirpath, dirnames, files in os.walk(path)
for f in files if f.endswith('.jpg')]
print "Preparing your pickle files. Pls wait..."
t0 = time.time()
for file_ in fileList:
print file_
img = Image.open(file_)
arr = PIL2array(img)
cPickle.dump(arr,open(file_+"-prot0"+".pkl","wb"),protocol=0)
t1=time.time()
total = t1-t0
print "P(h)ickling execution time: %.2f sec" % total
# routine to recursively traverse a folder and save list of file names
pklList = [os.path.join(dirpath, f)
for dirpath, dirnames, files in os.walk(path)
for f in files if f.endswith('.pkl')]
#print "hi"
all_files = []
for file_ in pklList:
all_files += [file_]
train_share = 0.6
valid_share = 0.2
seed = 1
n_train = int(len(all_files) * train_share)
n_valid = int(len(all_files) * valid_share)
rng = np.random.RandomState(seed)
rng.shuffle(all_files)
train = all_files[:n_train]
valid = all_files[n_train:(n_train + n_valid)]
test = all_files[(n_train + n_valid):]
save_path = os.path.join(dirpath, '../datasets.pkl')
with open(save_path, 'w') as fout:
cPickle.dump((train, valid, test), fout)
filters = tables.Filters(complib='blosc', complevel=5)
hdf5_file = 'dataset.h5'
full_path = os.path.join(dirpath, hdf5_file)
h5file = tables.open_file(full_path, mode='w',
title='pics',
filters=filters)
save_path = os.path.join(dirpath, '../datasets.pkl')
with open(save_path, 'r') as fin:
files = cPickle.load(fin)
**for subfiles, subset in zip(files, ['train', 'valid', 'test']):
group = h5file.create_group(h5file.root, subset, subset)
X = h5file.create_vlarray(group, 'X', atom=tables.UInt8Atom(),
title='Data values',
expectedrows=len(subfiles), f filters=filters)
y = h5file.create_carray(group, 'y', atom=tables.UInt8Atom(),
title='Data targets',
shape=(len(subfiles),), filters=filters)
s = h5file.create_carray(group, 's', atom=tables.UInt32Atom(),
title='Data shapes',
shape=(len(subfiles), 3), filters=filters)**
for i, file in enumerate(subfiles):
full_path = os.path.join(dirpath, file)
with open(full_path, 'r') as fin:
image, label = cPickle.load(fin)
X.append(image.flatten())
y[i] = label
s[i] = np.array(image.shape)
if i % 50 == 0:
print '.. aggregated', i, 'from', subset
h5file.flush()
h5file.flush()
我突出了我面临问题的部分。 我一直收到以下错误:
  in main:
    image, label = cPickle.load(fin)
ValueError: too many values to unpack
有人可以帮助我吗?