考虑一下我有这些清单:
l = [5,6,7,8,9,10,5,15,20]
m = [10,5]
我想在m
中获得l
的索引。我使用列表理解来做到这一点:
[(i,i+1) for i,j in enumerate(l) if m[0] == l[i] and m[1] == l[i+1]]
输出:[(5,6)]
但如果我在m
中有更多数字,我觉得这不是正确的方法。那么Python或NumPy有什么简单的方法吗?
另一个例子:
l = [5,6,7,8,9,10,5,15,20,50,16,18]
m = [10,5,15,20]
输出应为:
[(5,6,7,8)]
答案 0 :(得分:6)
您基本上是在另一个列表中查找列表的起始索引。
方法#1:解决这个问题的一种方法是创建我们正在搜索的列表中元素的滑动窗口,给我们一个2D
数组,然后简单地使用{ {1}}针对搜索列表针对之前获得的NumPy broadcasting
滑动窗口版本的每一行执行广播比较。因此,一种方法是 -
2D
样品运行 -
# strided_app is from https://stackoverflow.com/a/40085052/
def strided_app(a, L, S ): # Window len = L, Stride len/stepsize = S
nrows = ((a.size-L)//S)+1
n = a.strides[0]
return np.lib.stride_tricks.as_strided(a, shape=(nrows,L), strides=(S*n,n))
def pattern_index_broadcasting(all_data, search_data):
n = len(search_data)
all_data = np.asarray(all_data)
all_data_2D = strided_app(np.asarray(all_data), n, S=1)
return np.flatnonzero((all_data_2D == search_data).all(1))
out = np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
方法#2:另一种方法是获取滑动窗口,然后将行标量视图转换为数据,作为搜索数据和要搜索的数据,给我们{ {1}}要使用的数据,如此 -
In [340]: l = [5,6,7,8,9,10,5,15,20,50,16,18]
...: m = [10,5,15,20]
...:
In [341]: np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
Out[341]: array([5, 6, 7, 8])
In [342]: l = [5,6,7,8,9,10,5,15,20,50,16,18,10,5,15,20]
...: m = [10,5,15,20]
...:
In [343]: np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
Out[343]:
array([[ 5, 6, 7, 8],
[12, 13, 14, 15]])
答案 1 :(得分:5)
最简单的方法(使用纯Python)将迭代项目,首先只检查第一项是否匹配。这避免了在不需要时进行子列表比较。根据{{1}}的内容, 甚至可以胜过NumPy广播解决方案:
l
如果您对速度感兴趣,我会检查方法的效果(借鉴我的设置here):
def func(haystack, needle): # obviously needs a better name ...
if not needle:
return
# just optimization
lengthneedle = len(needle)
firstneedle = needle[0]
for idx, item in enumerate(haystack):
if item == firstneedle:
if haystack[idx:idx+lengthneedle] == needle:
yield tuple(range(idx, idx+lengthneedle))
>>> list(func(l, m))
[(5, 6, 7, 8)]
如果您的import random
import numpy as np
# strided_app is from https://stackoverflow.com/a/40085052/
def strided_app(a, L, S ): # Window len = L, Stride len/stepsize = S
nrows = ((a.size-L)//S)+1
n = a.strides[0]
return np.lib.stride_tricks.as_strided(a, shape=(nrows,L), strides=(S*n,n))
def pattern_index_broadcasting(all_data, search_data):
n = len(search_data)
all_data = np.asarray(all_data)
all_data_2D = strided_app(np.asarray(all_data), n, S=1)
return np.flatnonzero((all_data_2D == search_data).all(1))
# view1D is from https://stackoverflow.com/a/45313353/
def view1D(a, b): # a, b are arrays
a = np.ascontiguousarray(a)
void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
return a.view(void_dt).ravel(), b.view(void_dt).ravel()
def pattern_index_view1D(all_data, search_data):
a = strided_app(np.asarray(all_data), L=len(search_data), S=1)
a0v, b0v = view1D(np.asarray(a), np.asarray(search_data))
return np.flatnonzero(np.in1d(a0v, b0v))
def find_sublist_indices(haystack, needle):
if not needle:
return
# just optimization
lengthneedle = len(needle)
firstneedle = needle[0]
restneedle = needle[1:]
for idx, item in enumerate(haystack):
if item == firstneedle:
if haystack[idx+1:idx+lengthneedle] == restneedle:
yield tuple(range(idx, idx+lengthneedle))
def Divakar1(l, m):
return np.squeeze(pattern_index_broadcasting(l, m)[:,None] + np.arange(len(m)))
def Divakar2(l, m):
return np.squeeze(pattern_index_view1D(l, m)[:,None] + np.arange(len(m)))
def MSeifert(l, m):
return list(find_sublist_indices(l, m))
# Timing setup
timings = {Divakar1: [], Divakar2: [], MSeifert: []}
sizes = [2**i for i in range(5, 20, 2)]
# Timing
for size in sizes:
l = [random.randint(0, 50) for _ in range(size)]
m = [random.randint(0, 50) for _ in range(10)]
larr = np.asarray(l)
marr = np.asarray(m)
for func in timings:
# first timings:
# res = %timeit -o func(l, m)
# second timings:
if func is MSeifert:
res = %timeit -o func(l, m)
else:
res = %timeit -o func(larr, marr)
timings[func].append(res)
%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
fig = plt.figure(1)
ax = plt.subplot(111)
for func in timings:
ax.plot(sizes,
[time.best for time in timings[func]],
label=str(func.__name__))
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('size')
ax.set_ylabel('time [seconds]')
ax.grid(which='both')
ax.legend()
plt.tight_layout()
和l
是列表,我的功能优于所有尺寸的NumPy解决方案:
但是如果您使用Divakars NumPy解决方案,那么当您使用这些numpy数组时,大型数组(大小> 1000个元素)的结果会更快:
答案 2 :(得分:2)