Question

我有一个 label_image，是如下所示的数据框。请注意，某些对象（标记为 1 和 3 的对象）已被其他对象完全包围，相邻对象之间没有背景像素将它们彼此分开。label_image 在此处直接作为输入给出；我没有自己生成图像，因为我没有实际的图像文件（如 tif、jpg 等）。

dummy_img = pd.DataFrame(np.array([
    [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
    [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
    [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
    [0,0,0,2,2,2,2,2,2,2,2,2,4,4,4,4,4,0,0,0],
    [0,0,0,2,2,2,2,2,2,2,2,2,4,4,4,4,4,0,0,0],
    [0,0,0,2,2,2,2,2,2,2,2,2,3,3,3,4,4,0,0,0],
    [0,0,0,2,2,1,1,1,1,1,3,3,3,3,3,4,4,0,0,0],
    [0,0,0,2,2,1,1,1,1,1,3,3,3,3,3,4,4,0,0,0],
    [0,0,0,2,2,1,1,1,1,1,3,3,3,3,3,4,4,0,0,0],
    [0,0,0,2,2,1,1,1,1,1,3,3,3,3,3,4,4,0,0,0],
    [0,0,0,2,2,2,2,2,2,2,2,2,3,3,3,4,4,0,0,0],
    [0,0,0,2,2,2,2,2,2,2,2,2,4,4,4,4,4,0,0,0],
    [0,0,0,2,2,2,2,2,2,2,2,2,4,4,4,4,4,0,0,0],
    [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
    [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
    [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
]))

我想得到对象的轮廓。

我最初写的是：

import pandas as pd
import numpy as np
from scipy.sparse import coo_matrix
from scipy.ndimage import binary_erosion

def outline(label_image):
    """Return the outer outline pixels of a label image, grouped by label.

    All non-zero labels are eroded together as a single foreground region,
    so only pixels bordering the background (0) or the image edge are
    reported. Boundaries between two touching labels are NOT detected, and
    labels fully enclosed by other labels are missed; use ``outline_fix``
    when per-label outlines are needed.

    Parameters
    ----------
    label_image : pandas.DataFrame or 2-D array-like of int
        Label image; 0 is background. The input is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns ``label`` (int) and ``coords`` (list of ``(col, row)``
        tuples in row-major order), one row per label that touches the
        background. An empty DataFrame when there are no outline pixels.
    """
    # Work on a private copy: zeroing the eroded pixels directly in the
    # caller's frame would corrupt it for any later use.
    labels = np.array(label_image)

    # binary_erosion keeps only pixels whose whole neighbourhood is
    # non-zero; clearing those leaves just the outline.
    labels[binary_erosion(labels)] = 0

    c = coo_matrix(labels)
    if c.data.size == 0:
        return pd.DataFrame()

    df = pd.DataFrame({'coords': list(zip(c.col, c.row)), 'label': c.data})
    df = df.groupby('label')['coords'].apply(lambda s: s.tolist()).reset_index()
    return df.astype({"label": int})

作为输出：

label   coords
2       [(3, 3), (4, 3), (5, 3), (6, 3), (7, 3), (8, 3...
4       [(12, 3), (13, 3), (14, 3), (15, 3), (16, 3), ...

这是错误的。它不仅漏掉了内部对象，而且为检测到的对象导出的坐标也不完整。例如，对于 label 4，它只返回 label 4 与 0（背景）之间的轮廓，而忽略了 label 4 与 label 3 之间的轮廓。

我做了此修复程序

def outline_fix(label_image):
    res_list = []
    coo = coo_matrix(label_image)
    labels = np.unique(coo.data)
    for label in sorted(set(labels)):
        #print('label: %d' % label)
        c = coo.copy()
        c.data[c.data != label] = 0
        c = c.toarray()
        mask = binary_erosion(c)
        c[mask] = 0
        c = coo_matrix(c)
        if c.data.size > 0:
            df = pd.DataFrame({'coords': list(zip(c.col, c.row)), 'label': c.data})
            df = df.groupby('label')['coords'].apply(lambda group_series: group_series.tolist()).reset_index()
            df = df.astype({"label": int})
        else:
            df = pd.DataFrame()
        res_list.append(df)
        
    if res_list:
        out = pd.concat(res_list).astype({"label": int})
    else:
        out = pd.DataFrame()
        
    return out

返回：

label   coords
1       [(5, 6), (6, 6), (7, 6), (8, 6), (9, 6), (5, 7...
2       [(3, 3), (4, 3), (5, 3), (6, 3), (7, 3), (8, 3...
3       [(12, 5), (13, 5), (14, 5), (10, 6), (11, 6), ...
4       [(12, 3), (13, 3), (14, 3), (15, 3), (16, 3), ...

结果基本正确，但并非 100% 完美：例如 label 4 会丢失两对坐标，其坐标列表的长度应为 32 而不是 30。不过这一点我可以接受，并不是很重要。

修正后的函数的问题是速度太慢。在实际情况下，我的数组大小为 2000×2000，其中包含超过 2800 个对象。并非所有对象都是“嵌套”的，但嵌套的对象可能很多，具体取决于输入的图像。

代码在大约4分钟内完成（对于一个真实的现实情况），这太长了。请问有没有其他选择，或者有什么想法可以加快速度？

任何帮助表示赞赏

Answer 1

根据我的经验，当您必须使用 .apply 函数时，pandas 的速度非常慢。因此，我更喜欢直接对 .values 进行操作，然后再重新赋值。话虽如此，pandas 也可以做得很快，但这本身就是一门艺术。请看下面的代码，对于大图像，它肯定比 pandas 的实现要快：

import numpy as np
import pandas as pd
from collections import Counter

# Sample label image: 0 is background, 1-4 are object labels.
dummy_img = pd.DataFrame(np.array([
    [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
    [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
    [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
    [0,0,0,2,2,2,2,2,2,2,2,2,4,4,4,4,4,0,0,0],
    [0,0,0,2,2,2,2,2,2,2,2,2,4,4,4,4,4,0,0,0],
    [0,0,0,2,2,2,2,2,2,2,2,2,3,3,3,4,4,0,0,0],
    [0,0,0,2,2,1,1,1,1,1,3,3,3,3,3,4,4,0,0,0],
    [0,0,0,2,2,1,1,1,1,1,3,3,3,3,3,4,4,0,0,0],
    [0,0,0,2,2,1,1,1,1,1,3,3,3,3,3,4,4,0,0,0],
    [0,0,0,2,2,1,1,1,1,1,3,3,3,3,3,4,4,0,0,0],
    [0,0,0,2,2,2,2,2,2,2,2,2,3,3,3,4,4,0,0,0],
    [0,0,0,2,2,2,2,2,2,2,2,2,4,4,4,4,4,0,0,0],
    [0,0,0,2,2,2,2,2,2,2,2,2,4,4,4,4,4,0,0,0],
    [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
    [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0],
    [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
]))

img_np = dummy_img.values

# Boundary points per label: b_pts[k] holds the (row, col) points of
# label k + 1. Sized from the data instead of assuming exactly four labels.
b_pts = [[] for _ in range(int(img_np.max()))]

# Idea: slide a 3x3 window over the image; a pixel is a boundary point
# when any value inside its window differs from the pixel at the centre.
# Zero padding lets pixels on the image edge compare against background.
img_padded = np.pad(img_np, ((1, 1), (1, 1)))

for r, row in enumerate(img_np):
    for c, elem in enumerate(row):
        if elem == 0:
            # Background is not an object; without this guard elem - 1
            # would be -1 and the point would land in the last label's list.
            continue
        if not np.all(img_padded[r:r + 3, c:c + 3] == elem):
            b_pts[elem - 1].append((r, c))

for elem in b_pts:
    print(elem)

检测图像上的对象边界太慢

1 个答案: