创建PICKLE时的百分比过高(Python)

时间:2017-10-04 14:17:07

标签: python pickle

以下代码在执行时发生了一件奇怪的事情:显示completition的百分比超过100%。代码现在运行了大约45分钟..

这是我在错误original one给我之后适应Python 3的代码。

'''
Read and pre-process SD19 characters text file.
Blog post : http://seeb0h.github.io/howto/preprocess-sd19-dataset-for-digits-learning/
Characters in txt file are in 128x128 images with much padded zeros.
It may be suitable for learning to have smaller, deskewed, trimmed, squared ones
Following preprocessing is applied to the dataset:
 - Read glyph (see read_glyph())
 - Moment-based image deskew (see deskew())
 - Trim zeros rows and columns (see trim_padding())
 - Resize image while keeping aspect ratio (see resize_with_constant_ratio())
 - Pad zeros in order to get a square image (see pad_digit())
Extends original code from http://asciirain.com/wordpress/2013/04/08/exploring-sd19-glyph-recognition-with-randomforests/
Usage:
   preprocess_sd19_text.py

'''
#

import os
import re
import sys
import pickle
import cv2
import numpy as np
import math

def read_glyph(_line):
    """Extract digit from the text file
    Parameters
    ----------
    _line : string
        current line in SD19 text file
    Returns
    -------
    digit : np.array
        2D digit 128x128
    label : int
        the label
    """
    match = re.search("^(\S+) (\d+)", _line)
    label = match.group(1)
    vector = list(match.group(2))
    vector = [int(x) for x in vector]

    label = ord(label)
    label = str(symbol_map[label]) #changed from int to str

    digit = np.array(vector, 'float32')
    digit = (digit*-1.+1.).reshape(128, 128)

    return digit, label

def deskew(img):
    """Deskew digit
    Parameters
    ----------
    img : np.array
        2D digit array
    Returns
    -------
    dst : Deskewed digit
    """
    m = cv2.moments(img)
    if abs(m['mu02']) < 1e-2:
        return img.copy()
    skew = m['mu11']/m['mu02']
    rot_mat = np.float32([[1, skew, -0.5*max(img.shape[0], img.shape[1])*skew], [0, 1, 0]])
    img = cv2.warpAffine(img, rot_mat, (img.shape[0], img.shape[1]), flags=cv2.WARP_INVERSE_MAP | cv2.INTER_LINEAR)
    return img


def resize_with_constant_ratio(img, char_dim):
    """Resize image while keeping aspect ratio. Max dim is char_dim
    pad_dim is applied in order to have derivative friendly image
    Parameters
    ----------
    img : np.array
        2D digit array
    char_dim : int
        dst dim
    Returns
    -------
    dst : resized digit
    """
    roi_h = img.shape[0]
    roi_w = img.shape[1]

    max_dim = max(roi_w, roi_h)
    pad_dim = 2
    scale = float(char_dim-pad_dim) / max_dim
    if roi_w >= roi_h:
        new_w = int(char_dim-pad_dim)
        new_h = int(roi_h * scale)
    else:
        new_w = int(roi_w * scale)
        new_h = int(char_dim-pad_dim)

    dst = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)

    return dst

def trim_padding(img):
    """Trim zeros rows and columns
    Parameters
    ----------
    img : np.array
        2D digit array
    Returns
    -------
    dst : trimmed digit
    """
    mask_row = np.all(np.equal(img, 0), axis=1)
    dst = img[~mask_row]

    mask_col = np.all(np.equal(dst, 0), axis=0)
    dst = dst[:, ~mask_col]

    return dst

def pad_digit(img, char_dim):
    """Pad zeros in order to get a square char_dimxchar_dim image
    Parameters
    ----------
    img : np.array
        2D digit array
    char_dim : int
        image dim
    Returns
    -------
    dst : padded digit
    """
    pad_h = char_dim-img.shape[0]
    pad_w = char_dim-img.shape[1]
    pad_h_b = math.floor(pad_h/2)
    pad_h_t = pad_h - pad_h_b
    pad_w_r = math.floor(pad_w/2)
    pad_w_l = pad_w - pad_w_r

    dst = np.hstack(( img, np.zeros((img.shape[0], pad_w_r))))
    dst = np.hstack(( np.zeros((dst.shape[0], pad_w_l)), dst))

    dst = np.vstack(( dst, np.zeros((pad_h_b, dst.shape[1]))))
    dst = np.vstack(( np.zeros((pad_h_t, dst.shape[1])), dst))

    return dst


def print_overwrite(text):
    """Print with overwrite (for progression counter)
    Parameters
    ----------
    text : string
          text to display
    """
    delete = "\b" * (len (text)+1)
    print ("{0}{1}".format(delete, text)),

if __name__ == '__main__':
    print (__doc__)

    sd19_filename = "sd19-binary_digits.txt"
    data = open(sd19_filename, "r")
    dataset = []
    symbol_map = dict([(x, chr(x)) for x in list(range(48, 58)) + list(range(65, 91)) + list(range(97, 123))])  #added list() to every range

    current_dir = os.curdir

    num_records = 0
    num_lines = 402953

    char_dim=28
    pickle_name = "SD19_" + str(char_dim) + "x" + str(char_dim) + "_"


    for line in data:
        num_records += 1

        if num_records%20000 == 0:
            with open(os.path.join(current_dir, pickle_name +\
                    str(num_records) + ".pickle"), 'wb') as f:
                pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
        print_overwrite("num_records : {}/{} - {:5.2f}%"\
            .format(num_records, num_lines, num_records*1./num_lines*100))

        digit, label = read_glyph(line)
        digit_deskewed = deskew(digit)
        digit_trimmed = trim_padding(digit_deskewed)
        digit_resized = resize_with_constant_ratio(digit_trimmed, char_dim)
        digit_padded = pad_digit(digit_resized, char_dim)

        item = []
        item.append((digit_padded*255).astype('uint8'))
        item.append(label)
        dataset.append(item)


    with open(os.path.join(current_dir, pickle_name +\
                str(num_lines) + ".pickle"), 'wb') as f:
        pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)

它用于从包含二进制图像的.txt文件创建PICKLE。有关详情,请参阅here

我的“错误”......

files

到目前为止,它是135%,最后一个PICKLE文件大约是400mb ...

为什么会这样?此外,它继续创建文件(应该已停止在400.000或更多)。

0 个答案:

没有答案