我想从下图中连同字段一起提取 Passport number, Name, Nationality, date of birth
之类的信息:
我能够提取 Name、Passport number、date of birth,
但无法获取属性(图中红色下划线标出的部分),因为它们非常模糊。
我正在使用以下方法:
def data_extraction_with_cleaning_v2(path, file_name, threshold, preprocess_resize, filtering):
    """Run OCR on an image after optional preprocessing and return the text.

    The image is converted to grayscale, displayed in a window (blocking
    until a key is pressed), optionally thresholded, upscaled by 2x and
    smoothed, then passed to Tesseract. The recognised text is stripped of
    every character outside ``[\\da-zA-Z0-9_() \\n]``, written to
    ``outputbase1.txt`` in the current directory, passed through ``ftfy``
    and returned.

    Args:
        path: Directory prefix of the image. It is concatenated directly
            with ``file_name``, so it must already end with a separator.
        file_name: Name of the image file.
        threshold: ``"thresh"`` for Otsu binarisation, ``"adaptive"`` for
            Gaussian adaptive thresholding; any other value skips this step.
        preprocess_resize: ``"linear"`` or ``"cubic"`` to upscale 2x with
            that interpolation; any other value skips this step.
        filtering: ``"blur"`` (median), ``"bilateral"`` or ``"gauss"``;
            any other value skips this step.

    Returns:
        The cleaned OCR text as a string.

    Raises:
        FileNotFoundError: If OpenCV cannot read the image.
    """
    image_path = path + file_name
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising.
    if image is None:
        raise FileNotFoundError("Could not read image: {}".format(image_path))

    # Convert the loaded image (not an undefined name) to grayscale.
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    cv2.imshow("Output", gray)
    cv2.waitKey(0)

    # Optional thresholding.
    if threshold == "thresh":
        gray = cv2.threshold(gray, 0, 255,
                             cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    elif threshold == "adaptive":
        gray = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                     cv2.THRESH_BINARY, 31, 2)

    # Optional 2x upscaling.
    if preprocess_resize == "linear":
        gray = cv2.resize(gray, None, fx=2, fy=2,
                          interpolation=cv2.INTER_LINEAR)
    elif preprocess_resize == "cubic":
        gray = cv2.resize(gray, None, fx=2, fy=2,
                          interpolation=cv2.INTER_CUBIC)

    # Optional noise removal.
    if filtering == "blur":
        gray = cv2.medianBlur(gray, 3)
    elif filtering == "bilateral":
        gray = cv2.bilateralFilter(gray, 9, 75, 75)
    elif filtering == "gauss":
        gray = cv2.GaussianBlur(gray, (5, 5), 0)

    # Write the preprocessed image to a temporary file for OCR. The file is
    # removed even if OCR fails, and the PIL handle is closed before removal
    # so the delete also works on platforms that lock open files.
    filename = "{}.png".format(os.getpid())
    try:
        cv2.imwrite(filename, gray)
        with Image.open(filename) as ocr_input:
            # Append "+hin" to lang to also extract Hindi text.
            text = pytesseract.image_to_string(ocr_input, lang='eng')
    finally:
        if os.path.exists(filename):
            os.remove(filename)

    # Drop everything except alphanumerics, underscore, parentheses,
    # spaces and newlines.
    text = re.sub(r'[^\da-zA-Z0-9_() \n]+', '', text)

    # Persist the extracted text. Reading the file back is unnecessary:
    # after the substitution above the text round-trips unchanged.
    with open('outputbase1.txt', 'w', encoding='utf-8') as text_output:
        text_output.write(text)

    # Clean up any remaining gibberish / mojibake.
    text = ftfy.fix_text(text)
    text = ftfy.fix_encoding(text)
    return text
# Extract text from image.jpg using Otsu thresholding, 2x linear upscaling
# and median blurring.
text_passport_can = data_extraction_with_cleaning_v2(
    path=path,
    file_name='image.jpg',
    threshold='thresh',
    preprocess_resize='linear',
    filtering='blur',
)
有什么办法可以让这些模糊的文字变清晰并提取出来?