我有一个将224 * 224代码转换为MNIST样式压缩文件的代码。我从https://github.com/gskielian/JPG-PNG-to-MNIST-NN-Format借来了代码。
代码在我的本地计算机上运行良好。当我在google colab上运行它时,出现错误。直到上周为止,它在colab上都运行良好,但是不知何故,它不再在colab上运行。
仅提取标签有效。这意味着google colab在读取带有标签的zip文件时没有问题。该错误仅与提取图像的额外数据有关。
import os
from PIL import Image
from array import *
from random import Random
from os.path import isfile
import hashlib
# Load from and save to
Names = [['test-images','test']]
for name in Names:
print(name)
data_image = array('B')
data_label = array('B')
FileList = []
for f in os.listdir(name[0]):
if not isfile(name[0]+"/"+f):
for g in os.listdir(name[0]+"/"+f):
if ((g.endswith("png")) or (g.endswith("jpg"))):
FileList.append(os.path.join(name[0],f,g))
FileList.sort()
#shuffle(FileList) # Useful for further segmenting the validation set
# Random(4).shuffle(FileList)
for filename in FileList:
Stringlabel = filename.split('/')[2].split('.')[0]
#label = int(hashlib.sha256(Stringlabel.encode('utf-8')).hexdigest(), 16) % 10**2
label = int(filename.split('/')[1])
print(Stringlabel, label)
Im = Image.open(filename)
pixel = Im.load()
width, height = Im.size
for x in range(0,width):
for y in range(0,height):
data_image.append(pixel[y,x])
data_label.append(label) # labels start (one unsigned byte each)
hexval = "{0:#0{1}x}".format(len(FileList),10) # number of files in HEX
# header for label array
header = array('B')
header.extend([0,0,8,1])
header.append(int('0x'+hexval[2:][:2],16))
header.append(int('0x'+hexval[4:][:2],16))
header.append(int('0x'+hexval[6:][:2],16))
header.append(int('0x'+hexval[8:][:2],16))
data_label = header + data_label
# additional header for images array
if max([width,height]) <= 256:
header.extend([0,0,0,width,0,0,0,height])
else:
raise ValueError('Image exceeds maximum size: 256x256 pixels');
header[3] = 3 # Changing MSB for image data (0x00000803)
data_image = header + data_image
output_file = open(name[1]+'-images-idx3-ubyte', 'wb')
data_image.tofile(output_file)
output_file.close()
output_file = open(name[1]+'-labels-idx1-ubyte', 'wb')
data_label.tofile(output_file)
output_file.close()
# gzip resulting files
for name in Names:
os.system('gzip '+name[1]+'-images-idx3-ubyte')
os.system('gzip '+name[1]+'-labels-idx1-ubyte')
我的读取zip文件的代码如下:
def extract_data(filename, num_images):
with gzip.open(filename) as bytestream:
bytestream.read(16)
buf = bytestream.read(224 * 224 * num_images)
data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
data = data.reshape(num_images, 224,224)
return data
def extract_labels(filename, num_images):
with gzip.open(filename) as bytestream:
bytestream.read(8)
buf = bytestream.read(1 * num_images)
labels = np.frombuffer(buf, dtype=np.uint8).astype(np.int64)
return labels
我使用以下代码读取文件并将其存储在变量中:
test_data = extract_data('test-images-idx3-ubyte.gz', 252)
test_labels = extract_labels('test-labels-idx1-ubyte.gz',252)
我得到的错误如下:
IOErrorTraceback (most recent call last)
<ipython-input-12-4aaaf75c94fc> in <module>()
----> 1 test_data = extract_data('test-images-idx3-ubyte.gz', 252)
2 test_labels = extract_labels('test-labels-idx1-ubyte.gz',252)
3 test_labels[1]
<ipython-input-10-4798d2710754> in extract_data(filename, num_images)
2 # with open(filename, "rb") as bytestream:
3 with gzip.open(filename) as bytestream:
----> 4 bytestream.read(16)
5 buf = bytestream.read(224 * 224 * num_images)
6 data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
/usr/lib/python2.7/gzip.pyc in read(self, size)
266 try:
267 while size > self.extrasize:
--> 268 self._read(readsize)
269 readsize = min(self.max_read_chunk, readsize * 2)
270 except EOFError:
/usr/lib/python2.7/gzip.pyc in _read(self, size)
301
302 self._init_read()
--> 303 self._read_gzip_header()
304 self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
305 self._new_member = False
/usr/lib/python2.7/gzip.pyc in _read_gzip_header(self)
195 magic = self.fileobj.read(2)
196 if magic != '\037\213':
--> 197 raise IOError, 'Not a gzipped file'
198 method = ord( self.fileobj.read(1) )
199 if method != 8:
IOError: Not a gzipped file