假设我有一个 numpy 数组,它表示两种项目类型的 ID 之间的映射:
[[1, 12],
[1, 13],
[1, 14],
[2, 13],
[2, 14],
[3, 11]]
我想重新排列此数组,使新数组中的每一行包含原始数组中与同一个 ID 对应的所有项。新数组的每一列对应原始数组中的一个映射,列数以新数组指定的形状为上限。例如,对上面的数组做这种变换并限定只保留 2 列,将得到:
[[12, 13], #Represents 1 - 14 was not kept as only 2 columns are allowed
[13, 14], #Represents 2
[11, 0]] #Represents 3 - 0 was used as padding since 3 did not have 2 mappings
最朴素的做法是用 for 循环遍历原始数组的各行,逐行填充新数组。是否有更高效的方法,可以利用 numpy 的功能来实现?
答案 0 :(得分:3)
下面是一种通用的、基本符合 Numpy 风格(Numpythonic)的方法:
In [144]: def array_packer(arr):
...: cols = arr.shape[1]
...: ids = arr[:, 0]
...: inds = np.where(np.diff(ids) != 0)[0] + 1
...: sp = np.split(arr[:,1:], inds)
...: result = [np.unique(a[: cols]) if a.shape[0] >= cols else
...: np.pad(np.unique(a), (0, (cols - 1) * (cols - a.shape[0])), 'constant')
...: for a in sp]
...: return result
...:
...:
演示:
In [145]: a = np.array([[1, 12, 15, 45],
...: [1, 13, 23, 9],
...: [1, 14, 14, 11],
...: [2, 13, 90, 34],
...: [2, 14, 23, 43],
...: [3, 11, 123, 53]])
...:
In [146]: array_packer(a)
Out[146]:
[array([ 9, 11, 12, 13, 14, 15, 23, 45, 0, 0, 0]),
array([13, 14, 23, 34, 43, 90, 0, 0, 0, 0, 0, 0]),
array([ 11, 53, 123, 0, 0, 0, 0, 0, 0, 0, 0, 0])]
In [147]: a = np.array([[1, 12, 15],
...: [1, 13, 23],
...: [1, 14, 14],
...: [2, 13, 90],
...: [2, 14, 23],
...: [3, 11, 123]])
...:
...:
...:
In [148]: array_packer(a)
Out[148]:
[array([12, 13, 14, 15, 23]),
array([13, 14, 23, 90, 0, 0]),
array([ 11, 123, 0, 0, 0, 0])]
答案 1 :(得分:2)
对于这个问题,天真的for循环实际上是一个非常有效的解决方案:
from collections import defaultdict, deque
d = defaultdict(lambda: deque((0, 0), maxlen=2))
%%timeit
for key, val in a:
    # Each d[key] is a deque pre-filled with (0, 0) and bounded to two
    # entries, so every append pushes out the oldest entry: a key ends up
    # holding its last two values, left-padded with zeros.
    d[key].append(val)
4.43 µs ± 29.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
# result: {1: deque([13, 14]), 2: deque([13, 14]), 3: deque([0, 11])}
为了进行比较,这个线程中提出的numpy解决方案慢了4倍:
%timeit [[*a[a[:,0]==i,1],0][:2] for i in np.unique(a[:,0])]
18.6 µs ± 336 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Numpy非常好,我自己也经常使用它,但我认为这种情况很麻烦。
答案 2 :(得分:2)
这是一种使用稀疏矩阵的方法:
def pp(map_, maxitems=2):
    """Pack the values mapped to each ID into fixed-width, zero-padded rows.

    Parameters
    ----------
    map_ : 2-D integer array whose column 0 holds IDs and column 1 holds the
        mapped values. IDs are used as sparse-matrix column indices, so they
        must be non-negative integers.
    maxitems : number of columns in the result (values kept per ID).

    Returns
    -------
    2-D array with one row per distinct ID (ascending ID order). Each row
    holds the first ``maxitems`` values of that ID in input order, padded on
    the right with zeros.
    """
    if map_.shape[0] == 0:
        # No mappings at all: nothing to build a sparse matrix from.
        return np.zeros((0, maxitems), dtype=map_.dtype)

    # One stored entry per input row (indptr = 0, 1, 2, ...): row i keeps
    # value map_[i, 1] in column map_[i, 0].
    M = sparse.csr_matrix((map_[:, 1], map_[:, 0], np.arange(map_.shape[0] + 1)))
    # In CSC layout the stored values are grouped by column, i.e. by ID;
    # M.indptr marks where each ID's run starts inside M.data.
    M = M.tocsc()
    sizes = np.diff(M.indptr)      # number of values per ID
    ids, = np.where(sizes)         # IDs that occur at least once

    # Append maxitems - 1 zeros so that a window of length maxitems starting
    # at any stored value stays inside the buffer.
    D = np.concatenate([M.data, np.zeros((maxitems - 1,), dtype=M.data.dtype)])
    # Overlapping sliding windows: row i is a view of D[i:i + maxitems].
    D = np.lib.stride_tricks.as_strided(D, (D.size - maxitems + 1, maxitems),
                                        2 * D.strides)
    # Fancy indexing copies the selected windows, so writing below does not
    # touch the shared buffer.
    result = D[M.indptr[ids]]
    # Slots past the end of an ID's own run contain the next ID's values;
    # overwrite them with the zero padding.
    result[np.arange(maxitems) >= sizes[ids, None]] = 0
    return result
计时使用了 @chrisz 的代码,但改用重复较少的测试数据。我还添加了一些“验证”:chrisz 的解决方案和我的解决方案给出相同的答案;另外两个解决方案的输出格式不同,所以我无法检查它们。
代码:
from scipy import sparse
import numpy as np
from collections import defaultdict, deque
def pp(map_, maxitems=2):
    """Group mapped values by ID into a dense (n_ids, maxitems) array.

    ``map_`` is a 2-D integer array: column 0 is the ID, column 1 the mapped
    value. IDs serve as sparse column indices and therefore have to be
    non-negative integers. The result has one row per distinct ID, in
    ascending ID order, holding that ID's first ``maxitems`` values in input
    order and zeros in any unused trailing slots.
    """
    n_rows = map_.shape[0]
    if n_rows == 0:
        # Empty mapping: return an empty table of the requested width.
        return np.zeros((0, maxitems), dtype=map_.dtype)

    # CSR matrix with exactly one stored value per input row; the column
    # index of that value is the row's ID.
    row_ptr = np.arange(n_rows + 1)
    M = sparse.csr_matrix((map_[:, 1], map_[:, 0], row_ptr))
    # Switching to CSC reorders the stored values so that all values of one
    # ID are contiguous; indptr gives the start offset of every ID.
    M = M.tocsc()
    sizes = np.diff(M.indptr)
    ids, = np.where(sizes)          # skip IDs without any value

    # Zero tail keeps the last windows in bounds.
    tail = np.zeros((maxitems - 1,), dtype=M.data.dtype)
    D = np.concatenate([M.data, tail])
    # View D as all length-maxitems windows (row i == D[i:i + maxitems]).
    window_shape = (D.size - maxitems + 1, maxitems)
    D = np.lib.stride_tricks.as_strided(D, window_shape, 2 * D.strides)

    # Take the window that starts at each ID's first value (this is a copy).
    result = D[M.indptr[ids]]
    # A window may run into the following ID's values; blank those slots.
    overflow = np.arange(maxitems) >= sizes[ids, None]
    result[overflow] = 0
    return result
def chrisz(a, maxitems=2):
    """Return, per distinct ID, its first ``maxitems`` values padded with 0.

    ``a`` is a 2-D array with IDs in column 0 and mapped values in column 1.
    The result is a list of lists, one inner list per distinct ID in
    ascending ID order; each inner list has exactly ``maxitems`` entries.
    """
    ids = a[:, 0]
    values = a[:, 1]
    packed = []
    for key in np.unique(ids):
        # Values of this ID in input order, truncated to the column limit.
        kept = list(values[ids == key][:maxitems])
        # Right-pad short groups with zeros up to the column limit.
        kept.extend([0] * (maxitems - len(kept)))
        packed.append(kept)
    return packed
def piotr(a, maxitems=2):
    """Collect the first ``maxitems`` values of every key, zero-padded.

    ``a`` is an iterable of ``(key, value)`` pairs (for example the rows of
    a two-column array). Returns a ``defaultdict`` mapping each key to a
    ``deque`` of length ``maxitems`` that holds the key's first values in
    input order followed by zeros for unused slots.
    """
    def _blank():
        # Zero-filled, bounded slot buffer for a key seen for the first time.
        return deque([0] * maxitems, maxlen=maxitems)

    d = defaultdict(_blank)
    filled = defaultdict(int)   # number of slots already used per key
    for key, val in a:
        slots = d[key]
        used = filled[key]
        if used < maxitems:
            # Write by index rather than append(): appending to a full
            # bounded deque would drop the oldest value and shift the zero
            # padding to the left.
            slots[used] = val
            filled[key] = used + 1
    return d
def karams(arr, maxitems=2):
    """Split ``arr`` into per-ID groups and flatten each to a fixed length.

    ``arr`` is a 2-D array whose first column holds IDs; the remaining
    columns hold the mapped values. Groups are cut wherever two consecutive
    rows have different IDs, so rows of the same ID must be adjacent.

    Returns a list with one 1-D array per group: the value columns of the
    group's first ``maxitems`` rows, flattened row by row and right-padded
    with zeros to ``maxitems * (arr.shape[1] - 1)`` entries.
    """
    if arr.shape[0] == 0:
        # No rows means no IDs, hence no groups.
        return []
    ids = arr[:, 0]
    # Row indices at which a new ID starts.
    inds = np.where(np.diff(ids) != 0)[0] + 1
    groups = np.split(arr[:, 1:], inds)
    width = maxitems * (arr.shape[1] - 1)
    result = []
    for group in groups:
        flat = group[:maxitems].ravel()
        # Pad amount is zero for groups that already fill every slot.
        result.append(np.pad(flat, (0, width - flat.size), 'constant'))
    return result
def make(nid, ntot, seed=None):
    """Generate a random two-column ID/value mapping for benchmarking.

    Parameters
    ----------
    nid : exclusive upper bound for the IDs in column 0 (drawn from
        ``[0, nid)``).
    ntot : number of rows to generate.
    seed : optional seed. When given, a private ``RandomState`` is used so
        the same data is produced on every call; when omitted, numpy's
        global random state is used.

    Returns
    -------
    Integer array of shape ``(ntot, 2)``; column 1 holds values drawn from
    ``[0, 2**30)``.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)
    return np.c_[rng.randint(0, nid, (ntot,)),
                 rng.randint(0, 2**30, (ntot,))]
from timeit import timeit
import pandas as pd
import matplotlib.pyplot as plt
# Benchmark table: one row per implementation, one column per size
# parameter; each cell receives the total time of 30 calls.
res = pd.DataFrame(
    index=['pp', 'chrisz', 'piotr', 'karams'],
    columns=[10, 50, 100, 500, 1000, 5000, 10000],  # 50000 disabled
    dtype=float,
)
for c in res.columns:
    # Random mapping with c * 6 rows and IDs drawn from range(c // 2).
    data = make(c // 2, c * 6)
    # Sanity check: chrisz and pp must agree on the same input.
    assert np.all(chrisz(data) == pp(data))
    for f in res.index:
        # Hand the function and the data to timeit through `globals=` so the
        # benchmark does not depend on this file being run as __main__.
        res.at[f, c] = timeit(
            'func(data)',
            globals={'func': globals()[f], 'data': data},
            number=30,
        )
# Scale every column by its fastest entry and plot on log-log axes.
ax = res.div(res.min()).T.plot(loglog=True)
ax.set_xlabel("N")
ax.set_ylabel("time (relative)")
plt.show()
答案 3 :(得分:1)
根据一个几乎重复的问题的答案略作改编,使其补零并只取两个元素:
[[*a[a[:,0]==i,1],0][:2] for i in np.unique(a[:,0])]
输出:
[[12, 13], [13, 14], [11, 0]]
如果您还想保留对应的键:
{i:[*a[a[:,0]==i,1],0][:2] for i in np.unique(a[:,0])}
# {1: [12, 13], 2: [13, 14], 3: [11, 0]}
**函数**
def chrisz(a, maxitems=2):
    """Build one fixed-width, zero-padded list of values per distinct ID.

    ``a`` is a 2-D array: column 0 holds the IDs, column 1 the mapped
    values. For every distinct ID (ascending order) the first ``maxitems``
    values are kept in input order; shorter groups are filled up with 0.
    """
    keys = a[:, 0]
    vals = a[:, 1]
    rows = []
    for key in np.unique(keys):
        row = list(vals[keys == key][:maxitems])
        # Fill the unused trailing slots with zeros.
        row += [0] * (maxitems - len(row))
        rows.append(row)
    return rows
def piotr(a, maxitems=2):
    """Map every key to its first ``maxitems`` values, right-padded with 0.

    ``a`` is an iterable of ``(key, value)`` pairs. The returned
    ``defaultdict`` holds one bounded ``deque`` of length ``maxitems`` per
    key; values appear in input order and unused slots stay 0.
    """
    def _zeros():
        # Fresh all-zero buffer, created on first access to a key.
        return deque([0] * maxitems, maxlen=maxitems)

    d = defaultdict(_zeros)
    count = defaultdict(int)    # slots already written, per key
    for key, val in a:
        buf = d[key]
        pos = count[key]
        if pos >= maxitems:
            # Column limit reached for this key: ignore further values.
            continue
        # Indexed write keeps earlier values in place; append() on a full
        # bounded deque would evict the oldest one instead.
        buf[pos] = val
        count[key] = pos + 1
    return d
def karams(arr, maxitems=2):
    """Return one flattened, zero-padded value array per run of equal IDs.

    The first column of the 2-D array ``arr`` is the ID; all other columns
    are values. A new group starts at every row whose ID differs from the
    previous row's ID, so equal IDs have to be stored contiguously.

    Each output array contains the value columns of the group's first
    ``maxitems`` rows (row-major order), padded with trailing zeros to a
    common length of ``maxitems * (arr.shape[1] - 1)``.
    """
    if arr.shape[0] == 0:
        # Without rows there are no groups to report.
        return []
    ids = arr[:, 0]
    # Positions where the ID changes between consecutive rows.
    inds = np.where(np.diff(ids) != 0)[0] + 1
    sp = np.split(arr[:, 1:], inds)
    full_len = maxitems * (arr.shape[1] - 1)

    def _fit(group):
        # Truncate to the row limit, then pad up to the common length.
        flat = group[:maxitems].ravel()
        return np.pad(flat, (0, full_len - flat.size), 'constant')

    return [_fit(group) for group in sp]
**计时**
from timeit import timeit
import pandas as pd
import matplotlib.pyplot as plt
# Benchmark table: one row per implementation, one column per repeat
# factor; each cell receives the total time of 30 calls.
res = pd.DataFrame(
    index=['chrisz', 'piotr', 'karams'],
    columns=[10, 50, 100, 500, 1000, 5000, 10000, 50000],
    dtype=float,
)
# Six-row sample mapping; every row is repeated c times to scale the input.
base = np.array([[1, 12], [1, 13], [1, 14], [2, 13], [2, 14], [3, 11]])
for f in res.index:
    for c in res.columns:
        data = np.repeat(base, c, axis=0)
        # Hand the function and the data to timeit through `globals=` so the
        # benchmark does not depend on this file being run as __main__.
        res.at[f, c] = timeit(
            'func(data)',
            globals={'func': globals()[f], 'data': data},
            number=30,
        )
# Scale every column by its fastest entry and plot on log-log axes.
ax = res.div(res.min()).T.plot(loglog=True)
ax.set_xlabel("N")
ax.set_ylabel("time (relative)")
plt.show()
结果 (显然@Kasramvd是赢家):