每当我尝试建立一个以列标题为父级、以其下方的单元格为子级的层次结构时,由于轮廓的排序在中途被打乱,层次结构中的顺序也随之出错。
我正在尝试对包含表格扫描图像的pdf进行OCR。我已经完成了所有必要的预处理,找到了所需的轮廓并将它们“从左到右”排序。
def sort_contours(cnts, method="left-to-right"):
    """Sort contours by one coordinate of their bounding boxes.

    Args:
        cnts: Iterable of contours accepted by ``cv2.boundingRect``.
        method: ``"left-to-right"``, ``"right-to-left"``,
            ``"top-to-bottom"`` or ``"bottom-to-top"``. Any other value
            behaves like ``"left-to-right"``.

    Returns:
        A ``(contours, bounding_boxes)`` pair of equally long tuples in
        sorted order. Both tuples are empty when ``cnts`` is empty.

    Only a single coordinate is compared. Contours whose boxes share that
    coordinate keep their relative input order (the sort is stable), so
    this function alone does not impose any order along the other axis.
    """
    # Materialise once: the input is walked twice below (boxes, then zip).
    cnts = list(cnts)
    if not cnts:
        # zip(*[]) yields nothing, so the two-way unpack below would fail.
        return ((), ())

    # Descending order for the two "reverse" directions.
    reverse = method in ("right-to-left", "bottom-to-top")
    # Element 1 of the bounding rect is used for the vertical methods,
    # element 0 for the horizontal ones.
    axis = 1 if method in ("top-to-bottom", "bottom-to-top") else 0

    bounding_boxes = [cv2.boundingRect(c) for c in cnts]
    ordered = sorted(
        zip(cnts, bounding_boxes),
        key=lambda pair: pair[1][axis],
        reverse=reverse,
    )
    sorted_cnts, sorted_boxes = zip(*ordered)
    return (sorted_cnts, sorted_boxes)
def get_cell_contours(img, i):
    """Extract long vertical/horizontal lines from ``img`` and find contours.

    Args:
        img: Image passed to ``cv2.erode``; the combined line image is later
            converted with ``COLOR_GRAY2BGR``.
            NOTE(review): presumably a single-channel binarised scan with
            the table lines as foreground - confirm against the caller.
        i: Divisor applied to the image height to obtain the length (in
            pixels) of the line-detection kernels. A larger ``i`` gives a
            shorter kernel.

    Returns:
        ``(im2, contours, hierarchy)`` where ``im2`` is the BGR line image
        with every contour drawn on it, ``contours`` are the contours sorted
        "right-to-left" by ``sort_contours``, and ``hierarchy`` is the value
        returned by ``cv2.findContours``.

        NOTE(review): ``hierarchy`` is returned exactly as produced by
        ``cv2.findContours`` while ``contours`` has been re-ordered, so the
        two presumably no longer line up index-for-index - verify before
        using them together.
    """
    kernel_length = np.array(img).shape[0] // i

    # 1 x kernel_length keeps only vertical strokes, kernel_length x 1 only
    # horizontal ones.
    vertical_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (1, kernel_length))
    hori_kernel = cv2.getStructuringElement(
        cv2.MORPH_RECT, (kernel_length, 1))

    # Erode then dilate with the same kernel to isolate the vertical lines.
    eroded_vertical = cv2.erode(img, vertical_kernel, iterations=3)
    vertical_lines_img = cv2.dilate(
        eroded_vertical, vertical_kernel, iterations=3)

    # Same operation with the horizontal kernel.
    eroded_horizontal = cv2.erode(img, hori_kernel, iterations=3)
    horizontal_lines_img = cv2.dilate(
        eroded_horizontal, hori_kernel, iterations=3)

    # Equal weights: the two line images are simply added together.
    alpha = 1
    beta = 1
    img_final_bin = cv2.addWeighted(
        vertical_lines_img, alpha, horizontal_lines_img, beta, 0.0)

    # [-2:] keeps the (contours, hierarchy) pair whether findContours
    # returns two values or three (OpenCV 3.x prepends the image).
    contours, hierarchy = cv2.findContours(
        img_final_bin, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)[-2:]

    # Draw every contour on a colour copy of the line image.
    img_final_bin = cv2.cvtColor(img_final_bin, cv2.COLOR_GRAY2BGR)
    im2 = cv2.drawContours(img_final_bin, contours, -1, (128, 255, 0), 3)

    # Order the contours right-to-left; the bounding boxes are not needed.
    contours, _ = sort_contours(contours, method="right-to-left")
    return im2, contours, hierarchy
def get_contour_tree(img, contours):
    """Group an ordered sequence of contours into columns by perimeter.

    Each contour's perimeter (``cv2.arcLength``) is compared with a running
    reference length, seeded from ``contours[1]``:

    * more than 1.5x the reference: the contour is added to the current
      column, the column is stored in the result, and a new empty column
      is started;
    * less than 0.67x the reference: the reference is reset to this
      perimeter, the counter restarts at 1, and the contour is added;
    * otherwise: the contour is added, the counter is incremented, and the
      reference is updated as a running mean.

    Contours are prepended, so each stored column lists its entries in
    reverse encounter order.

    Args:
        img: Image converted with ``COLOR_GRAY2BGR`` and used as the canvas
            for the debug rendering.
        contours: Sequence of contours, in the order they should be grouped.

    Returns:
        A list of columns. Each column is a list of ``(label, [contour])``
        tuples, where ``label`` is the counter value when the contour was
        added. Returns ``[]`` when fewer than two contours are supplied.

    Side effects:
        Prints the number of columns, then for every stored contour draws
        it filled onto the image, overwrites ``Cell.jpg`` and sleeps five
        seconds (debug visualisation).

    NOTE(review): contours collected after the last column was stored are
    never appended to the result - confirm that dropping them is intended.
    """
    # The reference length is seeded from contours[1]; with fewer than two
    # contours there is nothing to compare against and no column can form.
    if len(contours) < 2:
        return []

    mean_length = cv2.arcLength(contours[1], closed=True)
    columns = []
    column = []
    j = 0
    for contour in contours:
        length = cv2.arcLength(contour, closed=True)
        if length > 1.5 * mean_length:
            # Much longer outline: closes the column collected so far.
            column.insert(0, (j, [contour]))
            columns.append(column)
            column = []
            mean_length = length
        elif length < 0.67 * mean_length:
            # Much shorter outline: restart the reference and the counter.
            mean_length = length
            j = 1
            column.insert(0, (j, [contour]))
        else:
            column.insert(0, (j, [contour]))
            j = j + 1
            # Running mean over the j similar-sized contours seen so far.
            mean_length = ((mean_length * (j - 1)) + length) / j

    img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
    print(len(columns))

    # Debug visualisation: fill the cells one by one in stored order.
    for stored_column in columns:
        print("........")
        for _, cell in stored_column:
            img = cv2.drawContours(img, cell, -1, (128, 255, 0), -1)
            cv2.imwrite("Cell.jpg", img)
            time.sleep(5)
    return columns
我希望数字的顺序是正确的,即 671 之后的数字应该是 672,而不是 643。