我正在尝试从图像中提取文本,但是对于手写图像,我无法获得正确的结果。另外,我还想从图像中提取一些印地语文字。
我已使用 Tesseract 库从图像中提取文本。它适用于印刷体(数字化)文本,但不适用于手写文本。
输入图片
输出图像
def get_string(path, lang="eng"):
    """Run Tesseract OCR on the image at *path* and return the recognised text.

    The image is converted to grayscale, passed through a dilate/erode
    step, saved as ``threshold.png``, shown in a preview window for up to
    four seconds, and then handed to Tesseract.  The recognised text is
    also written to ``sample.txt`` (UTF-8).

    Args:
        path: Location of the input image file.
        lang: Tesseract language code(s) passed to ``-l``.  Several
            languages can be joined with ``+`` (for example ``"eng+hin"``
            for English plus Hindi); the matching traineddata files must
            be installed.  Defaults to ``"eng"``, the previously
            hard-coded value.

    Returns:
        The text extracted by Tesseract.

    Raises:
        OSError: If OpenCV cannot read an image from *path*.
    """
    # cv2.imread reports failure by returning None rather than raising,
    # which would otherwise surface as an opaque error inside cvtColor.
    img = cv2.imread(path)
    if img is None:
        raise OSError("could not read image: {!r}".format(path))

    # Convert the image to grayscale.
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Tesseract options:
    #   '-l <lang>' selects the recognition language(s)
    #   '--oem 1'   uses the LSTM OCR engine
    #   '--psm 3'   fully automatic page segmentation
    config = "-l {} --oem 1 --psm 3".format(lang)

    # Dilation followed by erosion, intended to remove some noise.
    # NOTE(review): with a 1x1 kernel each pixel's neighbourhood is only
    # itself, so both operations leave the image unchanged; a larger
    # kernel is needed for this step to have any effect.
    kernel = np.ones((1, 1), np.uint8)
    img = cv2.dilate(img, kernel, iterations=1)
    img = cv2.erode(img, kernel, iterations=1)

    # Save the preprocessed image; it is re-read from disk for OCR below.
    cv2.imwrite("threshold.png", img)

    # Preview the preprocessed image (waits up to 4 s or until a key press).
    cv2.imshow("threshold.png", img)
    cv2.waitKey(4000)

    # Extract text from the preprocessed image using Tesseract.  The
    # context manager closes the image file handle once OCR is finished.
    with Image.open("threshold.png") as preprocessed:
        result = pytesseract.image_to_string(preprocessed, config=config)

    # Write the extracted text to a file.  An explicit UTF-8 encoding keeps
    # non-ASCII output (e.g. Devanagari) from failing on platforms whose
    # default encoding cannot represent it.
    with open("sample.txt", "w", encoding="utf-8") as f:
        f.write(result)

    return result