有什么好方法可以找出以 csc_matrix 格式存储的稀疏矩阵中的唯一列,以及每个唯一列重复出现的次数?
我没有关于矩阵元素的先验信息。它是对另一个矩阵的列进行有放回抽样得到的结果,因此可能包含重复的列,例如原始矩阵中的某一列被多次抽中。因此,我不能只对被抽样列的索引应用 numpy.unique;
而且我认为,先把整个矩阵转换为密集格式、再对其应用 numpy.unique,
并不是一个好的选择。
答案 0 :(得分:1)
您可以先按每列的非零元素个数对列进行排序和分组。然后在每个组内按行索引和数值排序,再把排序结果切分成内容完全相同的连续块,每个块对应一个唯一列,块的长度就是该列的重复次数:
import numpy as np
from scipy import sparse
def sparse_unique_columns(M):
    """Find the distinct columns of a sparse matrix and their multiplicities.

    Columns are first bucketed by their number of stored non-zeros. Inside
    each bucket the raw bytes of every column's row indices and values are
    sorted lexicographically, so identical columns become adjacent and can
    be counted as runs.

    Parameters
    ----------
    M : scipy sparse matrix
        Any object providing ``tocsc``. It is never modified; the function
        works on a private CSC copy.

    Returns
    -------
    uniques : CSC sparse matrix, shape ``(M.shape[0], nu)``
        One column per distinct column of ``M``. It is built by multiplying
        ``M`` with a float column-selection matrix, so its dtype is that of
        the product.
    idx : ndarray of int, shape ``(n,)``
        A permutation of the column indices of ``M`` in which all columns
        equal to ``uniques[:, k]`` occupy one consecutive run; the runs
        appear in the same order as the columns of ``uniques``.
    counts : ndarray of int, shape ``(nu,)``
        Length of each run in ``idx``, i.e. how often each distinct column
        occurs in ``M``.
    """
    # copy=True: the canonicalisation below works in place and must not leak
    # into the caller's matrix when it is already CSC.
    M = M.tocsc(copy=True)
    n = M.shape[1]
    if not M.has_sorted_indices:
        M.sort_indices()
    if not M.has_canonical_format:
        M.sum_duplicates()
    # Columns are compared through their stored entries, so an explicitly
    # stored zero would make two numerically equal columns look different.
    M.eliminate_zeros()

    sizes = np.diff(M.indptr)
    idx = np.argsort(sizes)
    if n == 0:
        # Nothing to group; M already has the right (rows, 0) shape.
        return M, idx, np.zeros((0,), int)

    # Multiplying by a column-selection matrix gathers the columns of M in
    # the order given by idx (ascending number of stored entries).
    gather = sparse.csc_matrix((np.ones((n,)), idx, np.arange(n + 1)), (n, n))
    Ms = M @ gather
    sorted_sizes = np.diff(Ms.indptr)

    # A new bucket starts wherever the per-column entry count changes.
    is_start = np.concatenate(
        [[True], sorted_sizes[1:] != sorted_sizes[:-1]])
    bounds = np.concatenate([np.flatnonzero(is_start), [n]])

    counts = []
    for lo, hi in zip(bounds[:-1], bounds[1:]):
        ss = int(sorted_sizes[lo])
        if ss == 0:
            # All empty columns are identical: a single run.
            counts.append(np.array([hi - lo]))
            continue
        pl, pr = Ms.indptr[[lo, hi]]
        # Reinterpret each column's ss values / ss row indices as one opaque
        # fixed-width record so whole columns sort and compare as units.
        dv = Ms.data[pl:pr].view(f'V{ss * Ms.data.dtype.itemsize}')
        iv = Ms.indices[pl:pr].view(f'V{ss * Ms.indices.dtype.itemsize}')
        order = np.lexsort((dv, iv))
        dv = dv[order]
        iv = iv[order]
        # Run boundaries: positions where either record differs from its
        # predecessor, plus both ends of the bucket.
        chng = np.flatnonzero(np.concatenate(
            [[True], (dv[1:] != dv[:-1]) | (iv[1:] != iv[:-1]), [True]]))
        counts.append(np.diff(chng))
        idx[lo:hi] = idx[lo:hi][order]

    counts = np.concatenate(counts)
    nu = counts.size
    # Position in idx of the first member of every run.
    first = np.concatenate([[0], counts[:-1].cumsum()])
    pick = sparse.csc_matrix(
        (np.ones((nu,)), idx[first], np.arange(nu + 1)), (n, nu))
    uniques = M @ pick
    return uniques, idx, counts
# Demo and micro-benchmark: sample 1000 columns (with replacement) from a
# sparse 1000 x 200 matrix and compare the sparse routine with the dense
# np.unique approach.
a = np.random.uniform(0, 10, (1000, 200))
a[a > 1] = 0  # keep roughly 10% of the entries
a = sparse.csc_matrix(a)
# Column j of b has a single 1 in a random row, so a @ b picks random columns
# of a. The shape is given explicitly: if it were inferred, the row count
# would be (largest sampled index + 1), which is below 200 whenever index 199
# happens not to be drawn, and a @ b would then fail.
b = sparse.csc_matrix(
    (np.ones(1000), np.random.randint(0, 200, (1000,)), np.arange(1001)),
    shape=(200, 1000))
c = a @ b
unq, idx, cnt = sparse_unique_columns(c)
# .toarray() instead of .A, which newer SciPy releases no longer provide.
unqd, idxd, cntd = np.unique(c.toarray(), axis=1, return_counts=True, return_inverse=True)
from timeit import timeit
# timeit returns total seconds; with number=1000 that equals ms per call,
# and with number=100 multiplying by 10 gives ms per call.
print('sparse:', timeit(lambda: sparse_unique_columns(c), number=1000), 'ms')
print('dense: ', timeit(lambda: np.unique(c.toarray(), axis=1, return_counts=True), number=100)*10, 'ms')
示例输出:
sparse: 2.735588440205902 ms
dense: 49.32689592242241 ms