Question

我想从numpy数组的中间删除零（但并非应删除所有零）

在整个stackoverflow的多个示例中都演示了如何删除零，但是我仍然发现很难为我的问题编写逻辑。

import numpy as np

# Sample input: mostly 255 with a few isolated zeros (runs of one and two),
# then a single long run of zeros, then a short tail of non-zero values.
a = np.array([255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,207,0,0
,159,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,88,239,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,231,88])

我有一个数组a，其中有一些非零值和一个很大的零簇（在非零值中间还有一些零）。我想删除零的大群集或找到该大群集开始的索引。然后将数组简化为以下形式：

a1 = [255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,207,0,0
,159,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,64]

和

a2=[88,239,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,231,88]

请注意，数组a1中仍然保留了一些零。仅当一段连续的零达到一定数量（例如10个零）时，才应删除这些零。我可以通过循环遍历数组来做到这一点，但如果有人能提出一种更简单、更快的方法，那就更好了。

Answer 1

不进行任何导入，只使用一个循环：

def remove_clusters(my_array, cluster_value, consecutive_max=10):
    """Split ``my_array`` into sublists at every long run of ``cluster_value``.

    A run of ``cluster_value`` longer than ``consecutive_max`` is removed and
    the elements after it start a new sublist. Shorter runs are kept in
    place, including a short run at the very end of the input.

    Args:
        my_array: Iterable of values to scan.
        cluster_value: Value whose long consecutive runs are removed.
        consecutive_max: Longest run of ``cluster_value`` that is still kept.

    Returns:
        A list of lists. The first sublist is always present (possibly
        empty); one more sublist is opened each time a long run is followed
        by another element. A long run at the end of the input is dropped
        without opening a new sublist.
    """
    my_result = [[]]
    cluster_list = []
    for e in my_array:
        if e == cluster_value:
            cluster_list.append(e)
            continue
        # A non-cluster element closes the pending run: keep it if it is
        # short enough, otherwise drop it and start a new sublist.
        if len(cluster_list) <= consecutive_max:
            my_result[-1].extend(cluster_list)
        else:
            my_result.append([])
        cluster_list = []
        my_result[-1].append(e)
    # The loop only flushes a run when a later element closes it, so a short
    # run at the end of the input must be flushed here or it would be lost.
    if len(cluster_list) <= consecutive_max:
        my_result[-1].extend(cluster_list)
    return my_result

我使用itertools.groupby做到了这一点，它简化了一些代码：

def remove_clusters(my_array, cluster_value=0, max_consecutive=10):
    """Split ``my_array`` into sublists at every long run of ``cluster_value``.

    Consecutive equal elements are grouped into runs. A run of
    ``cluster_value`` longer than ``max_consecutive`` is discarded and opens
    a new (initially empty) sublist; every other run is appended to the
    current sublist.

    Returns:
        A list of lists, starting with one sublist and gaining one more for
        each discarded run.
    """
    from itertools import groupby

    segments = [[]]
    for value, run in groupby(my_array):
        run_items = list(run)
        # Only an over-long run of the cluster value acts as a separator.
        if not (value != cluster_value or len(run_items) <= max_consecutive):
            segments.append([])
            continue
        segments[-1] += run_items
    return segments

那么您可以做：

# Unpacking into two names assumes remove_clusters returns exactly two
# sublists, i.e. exactly one over-long cluster was removed from `a`.
a1, a2 = remove_clusters(a)

最后，一个非常丑陋的oneliner使用functools.reduce

from itertools import groupby
from functools import reduce
# How the fold works:
#   * groupby(a) yields one group per run of equal consecutive values, and
#     the map(...) turns each group into a plain list `y`.
#   * The accumulator `x` starts as [[]], a list of output segments.
#   * A run of falsy values (zeros) longer than 10 is dropped and opens a
#     new, empty segment (x + [[]]).
#   * Any other run is concatenated onto the last segment
#     (x[:-1] + [x[-1]+y]).
# Unpacking into a1, a2 assumes exactly one run is dropped.
a1, a2 = reduce(lambda x,y: x + [[]] if not y[0] and len(y)>10 
                       else x[:-1] + [x[-1]+y], 
                map(lambda x: list(x[1]), groupby(a)), 
                [[]])

我很想解释这个oneliner，但我已经不明白了。

Answer 2

这是另一种使用numpy的方法，除列表推导之外，不对数组做任何显式循环。基本思想是获取形如[(value1, count1), (value2, count2)...]的列表，然后在该列表中查找满足所需条件的项。

有几处可以改进的地方，主要是对条件进行两次检查。

import numpy as np

# Sample input: mostly 255 with a few isolated zeros (runs of one and two),
# then a single long run of zeros, then a short tail of non-zero values.
a = np.array([255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,207,0,0
,159,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255
,255,64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
,0,0,0,0,0,0,0,0,0,88,239,255,255,255,255,255,255,255
,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,231,88])

def remove_consecutive(thearray, num, count):
    """Split a 1-D array at every run of ``num`` longer than ``count``.

    Each run of more than ``count`` consecutive ``num`` values is removed,
    and the elements on either side of it end up in separate pieces. Runs of
    ``count`` or fewer are left in place.

    Args:
        thearray: 1-D array (or array-like) to scan.
        num: Value whose long consecutive runs are removed.
        count: Longest run of ``num`` that is still kept.

    Returns:
        A 1-D object array with one entry per piece, each entry being a 1-D
        array of the surviving values. There is always one more piece than
        removed runs, so a removed run at the start or end of the input
        yields an empty piece there, and an input without any long run
        yields a single piece.
    """
    values = np.asarray(thearray)

    if values.size == 0:
        pieces = [values.copy()]
    else:
        # Start index of every run: position 0 plus each position whose
        # value differs from its predecessor.
        starts = np.concatenate(
            ([0], np.flatnonzero(values[1:] != values[:-1]) + 1)
        )

        # Run lengths are the gaps between consecutive starts; appending the
        # total size makes the last run's length correct as well.
        lengths = np.diff(starts, append=values.size)

        # Runs that must be cut out.
        is_long = (values[starts] == num) & (lengths > count)
        cut_starts = starts[is_long]
        cut_ends = cut_starts + lengths[is_long]

        # Each piece spans from the end of one removed run (or the start of
        # the array) to the start of the next removed run (or the end).
        piece_starts = np.concatenate(([0], cut_ends))
        piece_ends = np.concatenate((cut_starts, [values.size]))
        pieces = [
            values[begin:end].copy()
            for begin, end in zip(piece_starts, piece_ends)
        ]

    # Fill an object array element by element: np.array() on pieces of
    # unequal length is rejected by current numpy, and on equal-length
    # pieces it would merge them into one 2-D array.
    result = np.empty(len(pieces), dtype=object)
    for position, piece in enumerate(pieces):
        result[position] = piece
    return result

# Split the sample array at every run of more than 10 consecutive zeros.
# Unpacking into two names assumes exactly one such run exists in `a`.
a1, a2 = remove_consecutive(a, 0, 10)

print(a1)
print(a2)

输出为：

[255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255
 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255
 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255
   0 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255
 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255
 255 255 207   0   0 159 255 255 255 255 255 255 255 255 255 255 255 255
 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255
 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255
 255 255 255 255 255 255  64]
[ 88 239 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255 255
 255 255 255 255 255 255 231  88]

有没有办法了解零的簇，然后将其从numpy数组中删除？

2 个答案: