假设我有2个numpy数组,其中包含以下元素:
arr1 = [[1, 2], [3, 5], [3, 4]]
arr2 = [[2, 3], [3, 4], [6, 6]]
我希望结果数组附加arr2
或水平堆叠到
arr1
包含两个数组中不存在的元素:
arr3 = [[1, 2], [3, 5], [2, 3], [6, 6]]
如您所见,[3, 4]
在预期的结果数组中不存在。什么是最好的numpy和pythonic实现?
答案 0 :(得分:3)
我参加聚会有点晚了,但这是一种仅以适度的算法成本利用numpy
速度的方法:O(n log n)。事实证明,对于许多大小的输入数组,运行时由类型转换的成本占主导地位。见下面的基准:
from timeit import timeit
import numpy as np
def make_inputs(N, ncol):
global arr1, arr2, list1, list2, lot1, lot2
# create making sure there are no duplicates *within* arr1 or arr2
all_ = np.array(list(set(map(tuple, np.random.randint(0, 2 * N, (N + N//2, ncol))))))
# create input of various data types
arr1 = all_[np.random.choice(len(all_), N, False)]
arr2 = all_[np.random.choice(len(all_), N, False)]
list1 = arr1.tolist()
list2 = arr2.tolist()
lot1 = list(map(tuple, list1))
lot2 = list(map(tuple, list2))
def np_unique_preserve_order(a, b):
c = np.r_[a, b]
cr = c.view(np.dtype("|S" + str(c.shape[-1] * c.dtype.itemsize)))
uniq, inv, count = np.unique(cr.ravel(), return_inverse=True,
return_counts=True)
return c[(count==1)[inv], :]
def np_unique(a, b):
c = np.r_[a, b]
cr = c.view(np.dtype("|S" + str(c.shape[-1] * c.dtype.itemsize)))
uniq, count = np.unique(cr.ravel(), return_counts=True)
return uniq[count==1, None].view(c.dtype)
def np_sort(a, b):
c = np.r_[a, b]
cr = np.sort(c.view(np.dtype("|S" + str(c.shape[-1] * c.dtype.itemsize))).ravel())
m = np.empty(cr.shape, bool)
m[0] = True
m[1:] = cr[:-1] != cr[1:]
m[:-1] &= m[1:]
return cr[m, None].view(c.dtype)
# check
make_inputs(1000, 2)
assert set(map(tuple, lot1)).symmetric_difference(set(map(tuple, lot2))) == set(map(tuple, np_sort(arr1, arr2)))
assert set(map(tuple, lot1)).symmetric_difference(set(map(tuple, lot2))) == set(map(tuple, np_unique(arr1, arr2)))
assert set(map(tuple, lot1)).symmetric_difference(set(map(tuple, lot2))) == set(map(tuple, np_unique_preserve_order(arr1, arr2)))
for N, ncol in ((10, 2), (10000, 2), (100000, 20)):
make_inputs(N, ncol)
results = []
for inputs in 'lot', 'list', 'arr':
res = []
if inputs == 'lot':
res.append('{:11.5f} ms'.format(timeit(f'list(set({inputs}1).symmetric_difference(set({inputs}2)))',
f'from __main__ import {inputs}1, {inputs}2', number=10) * 100))
else:
res.append('{:11.5f} ms'.format(timeit(f'list(set(map(tuple, {inputs}1)).symmetric_difference(set(map(tuple, {inputs}2))))',
f'from __main__ import {inputs}1, {inputs}2', number=10) * 100))
res.append('{:11.5f} ms'.format(timeit(f'np_sort({inputs}1, {inputs}2)', f'from __main__ import {inputs}1, {inputs}2, np_sort',
number=10) * 100))
res.append('{:11.5f} ms'.format(timeit(f'np_unique({inputs}1, {inputs}2)', f'from __main__ import {inputs}1, {inputs}2, np_unique',
number=10) * 100))
res.append('{:11.5f} ms'.format(timeit(f'np_unique_preserve_order({inputs}1, {inputs}2)', f'from __main__ import {inputs}1, {inputs}2, np_unique_preserve_order',
number=10) * 100))
results.append(res)
results = zip(*results)
appmin = lambda l: l + (min(l),)
print(f'\nno rows {N}, no colums {ncol}')
print('input type lot list array best')
print(f'symm_diff ', *appmin(next(results)))
print(f'np_sort ', *appmin(next(results)))
print(f'np_unique ', *appmin(next(results)))
print(f'np_unique_preserve_order ', *appmin(next(results)))
输出:
no rows 10, no colums 2
input type lot list array best
symm_diff 0.00232 ms 0.00453 ms 0.02034 ms 0.00232 ms
np_sort 0.03890 ms 0.03269 ms 0.03060 ms 0.03060 ms
np_unique 0.04209 ms 0.04118 ms 0.04136 ms 0.04118 ms
np_unique_preserve_order 0.05289 ms 0.05253 ms 0.05253 ms 0.05253 ms
no rows 10000, no colums 2
input type lot list array best
symm_diff 3.71800 ms 8.30081 ms 21.92841 ms 3.71800 ms
np_sort 7.98128 ms 8.09973 ms 2.80091 ms 2.80091 ms
np_unique 8.49229 ms 8.25701 ms 3.00654 ms 3.00654 ms
np_unique_preserve_order 10.12377 ms 8.67133 ms 3.36172 ms 3.36172 ms
no rows 100000, no colums 20
input type lot list array best
symm_diff 97.83141 ms 211.20736 ms 590.15145 ms 97.83141 ms
np_sort 317.73268 ms 316.50081 ms 89.97820 ms 89.97820 ms
np_unique 324.68922 ms 326.89891 ms 98.06377 ms 98.06377 ms
np_unique_preserve_order 339.18597 ms 339.00286 ms 120.94041 ms 120.94041 ms
对于非常小的数组symm_diff
是最快的,但对于较大的数组np_sort
在允许所有方法使用最适合的数据类型时有一个小的边缘。
答案 1 :(得分:2)
arr_3 = list(set(arr_1).symmetric_difference(set(arr_2)))
或:
arr_3 = list(set(map(tuple, arr_1))
.symmetric_difference(set(map(tuple, arr_2))))
一点研究:
from random import randint
from timeit import timeit
import itertools
arr1 = [[randint(0, 5), randint(0, 5)] for _ in range(1000)]
arr2 = [[randint(0, 5), randint(0, 5)] for _ in range(1000)]
def symmetric_difference(a, b):
now_exists = set()
results = []
for e in itertools.chain(a, b):
rerp_of_e = repr(e)
if rerp_of_e not in now_exists:
now_exists.add(rerp_of_e)
results.append(e)
return results
def method2(a, b):
c = a + b
return [l for l in c if c.count(l) == 1]
print(timeit('[l for l in arr1 + arr2 if (arr1 + arr2).count(l) == 1]', 'from __main__ import arr1, arr2',
number=10) / 10)
print(timeit('method2(arr1, arr2)', 'from __main__ import arr1, arr2, method2',
number=10) / 10)
print(timeit('list(set(map(tuple, arr1)).symmetric_difference(set(map(tuple, arr2))))',
'from __main__ import arr1, arr2', number=10) / 10)
print(timeit('symmetric_difference(arr1, arr2)', 'from __main__ import arr1, arr2, symmetric_difference',
number=10) / 10)
输出:
0.16653713929933542
0.14676828165012284
0.00030277483018301685
0.0008909022958581315
答案 2 :(得分:2)
怎么样:
[l for l in arr1+arr2 if (arr1+arr2).count(l)==1]
输出:
[[1, 2], [3, 5], [2, 3], [6, 6]]
或者,如果你想要它更有效:
c=arr1+arr2
[l for l in c if c.count(l)==1]
答案 3 :(得分:2)
如果只有相同索引中的项目可以相似,您可以使用简单的numpythonic方法,如下所示:
In [10]: mask = ~(arr1 == arr2).all(1)
In [11]: np.vstack((arr1[mask], arr2[mask]))
Out[11]:
array([[1, 2],
[3, 5],
[2, 3],
[6, 6]])
否则,您可以根据Jaime's answer找到交叉点,然后将它们从组合数组中排除。
或使用以下方法:
In [17]: arr1_view = arr1.view([('', arr1.dtype)] * arr1.shape[1])
In [18]: arr2_view = arr2.view([('', arr2.dtype)] * arr2.shape[1])
In [19]: diff_arr1 = np.setdiff1d(arr1_view, arr2_view).view(arr1.dtype).reshape(-1, arr1.shape[1])
In [20]: diff_arr2 = np.setdiff1d(arr2_view, arr1_view).view(arr2.dtype).reshape(-1, arr2.shape[1])
In [21]: np.vstack((diff_arr1, diff_arr2))
Out[21]:
array([[1, 2],
[3, 5],
[2, 3],
[6, 6]])
如果你正在处理python数组和特别短的数组,你可以使用如下列表理解:
In [204]: [i for i in arr1 + arr2 if not (i in arr1 and i in arr2)]
Out[204]: [[1, 2], [3, 5], [2, 3], [6, 6]]
请注意,它比将列表转换为元组更快,而对于较大的数组使用set for short arrays bit,基于集合的方法需要蛋糕,在这种情况下,使用Numpy仍然更好:
In [209]: %timeit set(map(tuple, arr1)).symmetric_difference(map(tuple, arr2))
100000 loops, best of 3: 1.51 µs per loop
In [210]: %timeit [i for i in arr1 + arr2 if not (i in arr1 and i in arr2)]
1000000 loops, best of 3: 1.28 µs per loop