我看到numpy bincount的行为是我无法理解的。我想以行方式对二维数组中的值进行分区,并查看下面的行为。为什么它会与dbArray一起使用但是使用simarray会失败?
>>> dbArray
array([[1, 0, 1, 0, 1],
[1, 1, 1, 1, 1],
[1, 1, 0, 1, 1],
[1, 0, 0, 0, 0],
[0, 0, 0, 1, 1],
[0, 1, 0, 1, 0]])
>>> N.apply_along_axis(N.bincount,1,dbArray)
array([[2, 3],
[0, 5],
[1, 4],
[4, 1],
[3, 2],
[3, 2]], dtype=int64)
>>> simarray
array([[2, 0, 2, 0, 2],
[2, 1, 2, 1, 2],
[2, 1, 1, 1, 2],
[2, 0, 1, 0, 1],
[1, 0, 1, 1, 2],
[1, 1, 1, 1, 1]])
>>> N.apply_along_axis(N.bincount,1,simarray)
Traceback (most recent call last):
File "<pyshell#31>", line 1, in <module>
N.apply_along_axis(N.bincount,1,simarray)
File "C:\Python27\lib\site-packages\numpy\lib\shape_base.py", line 118, in apply_along_axis
outarr[tuple(i.tolist())] = res
ValueError: could not broadcast input array from shape (2) into shape (3)
答案 0 :(得分:12)
问题是bincount
并不总是返回相同形状的对象,特别是在缺少值时。例如:
>>> m = np.array([[0,0,1],[1,1,0],[1,1,1]])
>>> np.apply_along_axis(np.bincount, 1, m)
array([[2, 1],
[1, 2],
[0, 3]])
>>> [np.bincount(m[i]) for i in range(m.shape[1])]
[array([2, 1]), array([1, 2]), array([0, 3])]
有效,但是:
>>> m = np.array([[0,0,0],[1,1,0],[1,1,0]])
>>> m
array([[0, 0, 0],
[1, 1, 0],
[1, 1, 0]])
>>> [np.bincount(m[i]) for i in range(m.shape[1])]
[array([3]), array([1, 2]), array([1, 2])]
>>> np.apply_along_axis(np.bincount, 1, m)
Traceback (most recent call last):
File "<ipython-input-49-72e06e26a718>", line 1, in <module>
np.apply_along_axis(np.bincount, 1, m)
File "/usr/local/lib/python2.7/dist-packages/numpy/lib/shape_base.py", line 117, in apply_along_axis
outarr[tuple(i.tolist())] = res
ValueError: could not broadcast input array from shape (2) into shape (1)
不会。
您可以使用minlength
参数并使用lambda
或partial
或其他内容传递:
>>> np.apply_along_axis(lambda x: np.bincount(x, minlength=2), axis=1, arr=m)
array([[3, 0],
[1, 2],
[1, 2]])
答案 1 :(得分:3)
@DSM已经提到,二维数组的bincount不能在不知道数组最大值的情况下完成,因为这将意味着数组大小不一致。
但是由于numpy的索引功能强大,可以轻松实现2d bincount的更快实现,因为它不使用串联或其他任何方式。
def bincount2d(arr, bins=None):
if bins is None:
bins = np.max(arr) + 1
count = np.zeros(shape=[len(arr), bins], dtype=np.int64)
indexing = np.arange(len(arr))
for col in arr.T:
count[indexing, col] += 1
return count
t = np.array([[1,2,3],[4,5,6],[3,2,2]], dtype=np.int64)
print(bincount2d(t))
PS
此:
t = np.empty(shape=[10000, 100], dtype=np.int64)
s = time.time()
bincount2d(t)
e = time.time()
print(e - s)
给出的结果比这快2倍:
t = np.empty(shape=[100, 10000], dtype=np.int64)
s = time.time()
bincount2d(t)
e = time.time()
print(e - s)
因为for循环遍历列。因此,如果shape[0] < shape[1]
,最好转置2d数组。
UPD
没有比这更好的了(我是说仅使用python):
def bincount2d(arr, bins=None):
if bins is None:
bins = np.max(arr) + 1
count = np.zeros(shape=[len(arr), bins], dtype=np.int64)
indexing = (np.ones_like(arr).T * np.arange(len(arr))).T
np.add.at(count, (indexing, arr), 1)
return count
答案 2 :(得分:1)
这是一个完全按照您的要求执行的函数,但没有任何循环。
def sub_sum_partition(a, partition):
"""
Generalization of np.bincount(partition, a).
Sums rows of a matrix for each value of array of non-negative ints.
:param a: array_like
:param partition: array_like, 1 dimension, nonnegative ints
:return: matrix of shape ('one larger than the largest value in partition', a.shape[1:]). The i's element is
the sum of rows j in 'a' s.t. partition[j] == i
"""
assert partition.shape == (len(a),)
n = np.prod(a.shape[1:], dtype=int)
bins = ((np.tile(partition, (n, 1)) * n).T + np.arange(n, dtype=int)).reshape(-1)
sums = np.bincount(bins, a.reshape(-1))
if n > 1:
sums = sums.reshape(-1, *a.shape[1:])
return sums