我正在使用SVM进行手写字符分类。我准备了自己的字符数据集(图像)。至于现在从a到f。每个文件夹(a到f)都有大约20到22张图像。因此,总共约有188张图像。
在将图像添加到numpy数组并在测试期间进行预测时遇到问题。
下面提供了所有源代码。
import cv2
import numpy as np
import glob
from sklearn import svm
from sklearn.externals import joblib
from sklearn import model_selection
#constant values
image_location = "/home/syedjafer/Documents/Handwriting_recognition_svm/test1/images/"
folder_depth = 7
image_x = []
image_y = []
files = []
#to get all files in images folder
for level in range(folder_depth):
folder = chr(ord("b")+level)
print (image_location+folder+"/")
label_len = len(glob.glob(image_location+folder+"/"+"*.png"))
files = files + glob.glob(image_location+folder+"/"+"*.png")
print(files)
image_y = image_y + [folder] * (label_len)
print(image_y)
['b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'd', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'e', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'f', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'g', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h', 'h']
# adding images to numpy arrays
for file in files:
image = cv2.imread(file,0) # 0 for grayscale 2D array
image_x.append(image.flatten().tolist())
print(len(image_x))
166
# decalring test size and seed
test_size = 0.33
seed = 7
X = np.array(image_x , dtype="uint8")
image_y = np.array(image_y)
x_train , x_test , y_train , y_test = model_selection.train_test_split(image_x, image_y, test_size=test_size, random_state=seed)
clf = svm.SVC(gamma=0.0001, C=100)
clf.fit(x_train, y_train)
SVC(C=100, cache_size=200, class_weight=None, coef0=0.0,
decision_function_shape='ovr', degree=3, gamma=0.0001, kernel='rbf',
max_iter=-1, probability=False, random_state=None, shrinking=True,
tol=0.001, verbose=False)
print(y_train)
['f' 'h' 'b' 'g' 'f' 'h' 'f' 'h' 'e' 'g' 'e' 'b' 'h' 'e' 'c' 'b' 'c' 'g'
'e' 'd' 'h' 'c' 'f' 'd' 'f' 'd' 'b' 'd' 'b' 'b' 'h' 'e' 'b' 'b' 'b' 'c'
'd' 'h' 'f' 'g' 'b' 'd' 'g' 'b' 'c' 'e' 'g' 'e' 'f' 'd' 'c' 'c' 'h' 'g'
'c' 'h' 'f' 'f' 'h' 'f' 'g' 'f' 'b' 'f' 'g' 'g' 'g' 'b' 'd' 'd' 'c' 'f'
'b' 'h' 'e' 'h' 'f' 'g' 'b' 'e' 'h' 'e' 'g' 'b' 'c' 'h' 'c' 'f' 'd' 'e'
'c' 'b' 'b' 'd' 'e' 'b' 'g' 'g' 'e' 'g' 'c' 'f' 'e' 'e' 'b' 'g' 'f' 'f'
'h' 'd' 'c']
for xx , yy in zip(x_test,y_test):
print(clf.predict(np.array(xx).reshape(1,-1)),"=>",yy,"\n",)
['b'] => d
['b'] => f
['b'] => g
['b'] => c
['b'] => d
['b'] => e
['b'] => e
['b'] => h
['b'] => h
['f'] => f
['b'] => h
['b'] => g
['b'] => h
['b'] => c
['b'] => h
['b'] => g
['b'] => b
['b'] => f
['b'] => g
['b'] => f
['b'] => e
['b'] => c
['b'] => f
['b'] => g
['b'] => c
['b'] => f
['b'] => b
['b'] => f
['b'] => e
['b'] => e
['b'] => e
['b'] => g
['g'] => g
['b'] => h
['b'] => c
['b'] => d
['f'] => f
['b'] => d
['b'] => f
['b'] => g
['b'] => d
['b'] => c
['b'] => c
['b'] => d
['b'] => h
['b'] => g
['b'] => b
['b'] => h
['b'] => c
['b'] => e
['b'] => d
['b'] => c
['b'] => d
['b'] => c
['b'] => b
将图像添加到numpy数组的步骤
训练分类器
clf = svm.SVC(gamma=0.0001, C=100)
clf.fit(x_train, y_train)
在测试阶段,它会产生错误。那是第一组中的类标签,它已经被更多地预测了。 (例如,第一个集合是“ a”,然后是“ b”和其他集合。)
我不知道错误在哪里发生。
我正在使用的数据集图像的scrnsht, dataset images