Question

我有一段代码，用于将224 * 224的图像转换为MNIST格式的压缩文件。这段代码借自 https://github.com/gskielian/JPG-PNG-to-MNIST-NN-Format 。

代码在我的本地计算机上运行良好。当我在google colab上运行它时，出现错误。直到上周为止，它在colab上都运行良好，但是不知何故，它不再在colab上运行。

只有提取标签的部分可以正常工作。这说明Google Colab在读取标签的gzip文件时没有问题，错误仅出现在提取图像数据的时候。

import gzip
import hashlib
import os
import shutil
import struct
from array import *
from os.path import isfile
from random import Random

from PIL import Image

# Load from and save to: each entry is [source directory, output file prefix].
# The source directory must contain one sub-directory per class, named with the
# integer class label (0-255), holding that class's png/jpg images.
Names = [['test-images','test']]

for name in Names:
    print(name)
    data_image = array('B')
    data_label = array('B')

    # Collect (image path, label) pairs. The label is taken from the class
    # directory name here, rather than by splitting the joined path on '/',
    # so it works with any path separator and with nested source directories.
    FileList = []
    for f in os.listdir(name[0]):
        class_dir = os.path.join(name[0], f)
        if not isfile(class_dir):
            for g in os.listdir(class_dir):
                if g.endswith(("png", "jpg")):
                    FileList.append((os.path.join(class_dir, g), int(f)))

    if not FileList:
        raise ValueError('No png/jpg images found under ' + name[0])

    # Sort by path for a deterministic order. To shuffle instead (useful for
    # further segmenting a validation set), call Random(4).shuffle(FileList).
    FileList.sort()

    width = height = None
    for filename, label in FileList:
        Stringlabel = os.path.basename(filename).split('.')[0]
        print(Stringlabel, label)

        # The output holds one unsigned byte per pixel, so force 8-bit
        # grayscale; colour images would otherwise yield tuples per pixel.
        Im = Image.open(filename).convert('L')

        # Every image must share one size, because the header stores a
        # single rows/cols pair for the whole file.
        if width is None:
            width, height = Im.size
        elif Im.size != (width, height):
            raise ValueError('%s is %dx%d but earlier images are %dx%d'
                             % (filename, Im.size[0], Im.size[1], width, height))

        # For mode 'L', tobytes() returns the pixels row by row, one byte each.
        data_image.extend(bytearray(Im.tobytes()))

        data_label.append(label) # labels start (one unsigned byte each)

    # Headers are big-endian 32-bit integers.
    # Labels: magic 0x00000801, item count.
    # Images: magic 0x00000803, item count, rows (height), cols (width).
    label_header = struct.pack('>II', 0x00000801, len(FileList))
    image_header = struct.pack('>IIII', 0x00000803, len(FileList), height, width)

    with open(name[1]+'-images-idx3-ubyte', 'wb') as output_file:
        output_file.write(image_header)
        data_image.tofile(output_file)

    with open(name[1]+'-labels-idx1-ubyte', 'wb') as output_file:
        output_file.write(label_header)
        data_label.tofile(output_file)

# gzip resulting files
#
# Compress with the gzip module instead of shelling out to the `gzip` binary:
# it does not depend on an external program, raises on failure, and always
# overwrites a stale .gz left over from a previous run. As with the command
# line tool, the uncompressed file is removed afterwards.

for name in Names:
    for suffix in ('-images-idx3-ubyte', '-labels-idx1-ubyte'):
        raw_path = name[1] + suffix
        with open(raw_path, 'rb') as raw_file:
            with gzip.open(raw_path + '.gz', 'wb') as gz_file:
                shutil.copyfileobj(raw_file, gz_file)
        os.remove(raw_path)

我读取gzip文件的代码如下：

def extract_data(filename, num_images, rows=224, cols=224):
    """Read image pixels from a gzipped IDX3 (MNIST-style) file.

    Args:
        filename: Path to the gzip-compressed image file.
        num_images: Number of images to read from the start of the file.
        rows: Height of each image in pixels (default 224).
        cols: Width of each image in pixels (default 224).

    Returns:
        A float32 numpy array of shape (num_images, rows, cols) holding the
        raw pixel bytes converted to floats.

    Raises:
        ValueError: If the file holds fewer pixel bytes than requested.
    """
    expected = rows * cols * num_images
    with gzip.open(filename) as bytestream:
        # Skip the 16-byte header that precedes the pixel data.
        bytestream.read(16)
        buf = bytestream.read(expected)
    # Fail with a clear message on a truncated file instead of an opaque
    # reshape error further down.
    if len(buf) != expected:
        raise ValueError('%s: expected %d pixel bytes, got %d'
                         % (filename, expected, len(buf)))
    data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)
    return data.reshape(num_images, rows, cols)

def extract_labels(filename, num_images):
    """Read class labels from a gzipped IDX1 (MNIST-style) label file.

    Args:
        filename: Path to the gzip-compressed label file.
        num_images: Number of one-byte labels to read.

    Returns:
        A 1-D int64 numpy array with the labels that were read.
    """
    header_size = 8  # bytes preceding the first label
    stream = gzip.open(filename)
    try:
        stream.read(header_size)
        raw = stream.read(num_images)
    finally:
        stream.close()
    as_bytes = np.frombuffer(raw, dtype=np.uint8)
    return as_bytes.astype(np.int64)

我使用以下代码读取文件并将其存储在变量中：

# Load 252 test images and their 252 labels from the gzipped files whose
# names match what the conversion script produces for the 'test' prefix.
test_data = extract_data('test-images-idx3-ubyte.gz', 252)
test_labels = extract_labels('test-labels-idx1-ubyte.gz',252)

我得到的错误如下：

IOErrorTraceback (most recent call last)

<ipython-input-12-4aaaf75c94fc> in <module>()
----> 1 test_data = extract_data('test-images-idx3-ubyte.gz', 252)
      2 test_labels = extract_labels('test-labels-idx1-ubyte.gz',252)
      3 test_labels[1]

<ipython-input-10-4798d2710754> in extract_data(filename, num_images)
      2 #     with open(filename, "rb") as bytestream:
      3     with gzip.open(filename) as bytestream:
----> 4         bytestream.read(16)
      5         buf = bytestream.read(224 * 224 * num_images)
      6         data = np.frombuffer(buf, dtype=np.uint8).astype(np.float32)

/usr/lib/python2.7/gzip.pyc in read(self, size)
    266             try:
    267                 while size > self.extrasize:
--> 268                     self._read(readsize)
    269                     readsize = min(self.max_read_chunk, readsize * 2)
    270             except EOFError:

/usr/lib/python2.7/gzip.pyc in _read(self, size)
    301 
    302             self._init_read()
--> 303             self._read_gzip_header()
    304             self.decompress = zlib.decompressobj(-zlib.MAX_WBITS)
    305             self._new_member = False

/usr/lib/python2.7/gzip.pyc in _read_gzip_header(self)
    195         magic = self.fileobj.read(2)
    196         if magic != '\037\213':
--> 197             raise IOError, 'Not a gzipped file'
    198         method = ord( self.fileobj.read(1) )
    199         if method != 8:

IOError: Not a gzipped file

在Google Colaboratory中读取gzip文件时出现问题，但在我的本地计算机上运行正常

0 个答案: