以下代码在执行时发生了一件奇怪的事情:显示completition的百分比超过100%。代码现在运行了大约45分钟..
这是我在错误original one给我之后适应Python 3的代码。
'''
Read and pre-process SD19 characters text file.
Blog post : http://seeb0h.github.io/howto/preprocess-sd19-dataset-for-digits-learning/
Characters in txt file are in 128x128 images with much padded zeros.
It may be suitable for learning to have smaller, deskewed, trimmed, squared ones
Following preprocessing is applied to the dataset:
- Read glyph (see read_glyph())
- Moment-based image deskew (see deskew())
- Trim zeros rows and columns (see trim_padding())
- Resize image while keeping aspect ratio (see resize_with_constant_ratio())
- Pad zeros in order to get a square image (see pad_digit())
Extends original code from http://asciirain.com/wordpress/2013/04/08/exploring-sd19-glyph-recognition-with-randomforests/
Usage:
preprocess_sd19_text.py
'''
#
import os
import re
import sys
import pickle
import cv2
import numpy as np
import math
def read_glyph(_line):
"""Extract digit from the text file
Parameters
----------
_line : string
current line in SD19 text file
Returns
-------
digit : np.array
2D digit 128x128
label : int
the label
"""
match = re.search("^(\S+) (\d+)", _line)
label = match.group(1)
vector = list(match.group(2))
vector = [int(x) for x in vector]
label = ord(label)
label = str(symbol_map[label]) #changed from int to str
digit = np.array(vector, 'float32')
digit = (digit*-1.+1.).reshape(128, 128)
return digit, label
def deskew(img):
"""Deskew digit
Parameters
----------
img : np.array
2D digit array
Returns
-------
dst : Deskewed digit
"""
m = cv2.moments(img)
if abs(m['mu02']) < 1e-2:
return img.copy()
skew = m['mu11']/m['mu02']
rot_mat = np.float32([[1, skew, -0.5*max(img.shape[0], img.shape[1])*skew], [0, 1, 0]])
img = cv2.warpAffine(img, rot_mat, (img.shape[0], img.shape[1]), flags=cv2.WARP_INVERSE_MAP | cv2.INTER_LINEAR)
return img
def resize_with_constant_ratio(img, char_dim):
"""Resize image while keeping aspect ratio. Max dim is char_dim
pad_dim is applied in order to have derivative friendly image
Parameters
----------
img : np.array
2D digit array
char_dim : int
dst dim
Returns
-------
dst : resized digit
"""
roi_h = img.shape[0]
roi_w = img.shape[1]
max_dim = max(roi_w, roi_h)
pad_dim = 2
scale = float(char_dim-pad_dim) / max_dim
if roi_w >= roi_h:
new_w = int(char_dim-pad_dim)
new_h = int(roi_h * scale)
else:
new_w = int(roi_w * scale)
new_h = int(char_dim-pad_dim)
dst = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_LINEAR)
return dst
def trim_padding(img):
"""Trim zeros rows and columns
Parameters
----------
img : np.array
2D digit array
Returns
-------
dst : trimmed digit
"""
mask_row = np.all(np.equal(img, 0), axis=1)
dst = img[~mask_row]
mask_col = np.all(np.equal(dst, 0), axis=0)
dst = dst[:, ~mask_col]
return dst
def pad_digit(img, char_dim):
"""Pad zeros in order to get a square char_dimxchar_dim image
Parameters
----------
img : np.array
2D digit array
char_dim : int
image dim
Returns
-------
dst : padded digit
"""
pad_h = char_dim-img.shape[0]
pad_w = char_dim-img.shape[1]
pad_h_b = math.floor(pad_h/2)
pad_h_t = pad_h - pad_h_b
pad_w_r = math.floor(pad_w/2)
pad_w_l = pad_w - pad_w_r
dst = np.hstack(( img, np.zeros((img.shape[0], pad_w_r))))
dst = np.hstack(( np.zeros((dst.shape[0], pad_w_l)), dst))
dst = np.vstack(( dst, np.zeros((pad_h_b, dst.shape[1]))))
dst = np.vstack(( np.zeros((pad_h_t, dst.shape[1])), dst))
return dst
def print_overwrite(text):
"""Print with overwrite (for progression counter)
Parameters
----------
text : string
text to display
"""
delete = "\b" * (len (text)+1)
print ("{0}{1}".format(delete, text)),
if __name__ == '__main__':
print (__doc__)
sd19_filename = "sd19-binary_digits.txt"
data = open(sd19_filename, "r")
dataset = []
symbol_map = dict([(x, chr(x)) for x in list(range(48, 58)) + list(range(65, 91)) + list(range(97, 123))]) #added list() to every range
current_dir = os.curdir
num_records = 0
num_lines = 402953
char_dim=28
pickle_name = "SD19_" + str(char_dim) + "x" + str(char_dim) + "_"
for line in data:
num_records += 1
if num_records%20000 == 0:
with open(os.path.join(current_dir, pickle_name +\
str(num_records) + ".pickle"), 'wb') as f:
pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
print_overwrite("num_records : {}/{} - {:5.2f}%"\
.format(num_records, num_lines, num_records*1./num_lines*100))
digit, label = read_glyph(line)
digit_deskewed = deskew(digit)
digit_trimmed = trim_padding(digit_deskewed)
digit_resized = resize_with_constant_ratio(digit_trimmed, char_dim)
digit_padded = pad_digit(digit_resized, char_dim)
item = []
item.append((digit_padded*255).astype('uint8'))
item.append(label)
dataset.append(item)
with open(os.path.join(current_dir, pickle_name +\
str(num_lines) + ".pickle"), 'wb') as f:
pickle.dump(dataset, f, pickle.HIGHEST_PROTOCOL)
它用于从包含二进制图像的.txt文件创建PICKLE。有关详情,请参阅here。
我的“错误”......
到目前为止,它是135%,最后一个PICKLE文件大约是400mb ...
为什么会这样?此外,它继续创建文件(应该已停止在400.000或更多)。