import time

import numpy as np
import pandas as pd
# Fixed seed so the synthetic benchmark data is reproducible.
np.random.seed(0)
# 500,000 rows: 'gr' holds group keys drawn from range(7000) and 'col' holds
# values drawn from range(1000).
df = pd.DataFrame({
    'gr': np.random.choice(7000, 500000),
    'col': np.random.choice(1000, 500000),
})
groups = df.groupby('gr')

# perf_counter() is the monotonic, high-resolution clock intended for
# measuring elapsed time; time.time() follows the wall clock and can jump.
t1 = time.perf_counter()
# For every group, the index label of the row holding the largest 'col'.
idx = groups.col.idxmax()
print(round(time.perf_counter() - t1, 1))
0.7
有没有一种方法可以比idxmax()更快地获取这些索引?
请注意,我只对 idx.values 感兴趣,不介意丢掉 idx 的索引(idx.index)。
答案 0 :(得分:4)
在我这边的测试中,先用 sort_values 排序、再用 drop_duplicates 去重的做法,比 groupby + idxmax 快大约七到八倍:
# Sort by group and then by value, and keep only the last row of each group,
# i.e. a row holding that group's largest 'col'.
# NOTE(review): when a group's maximum is tied, the row kept here may differ
# from the one idxmax() reports -- confirm before relying on exact equality.
%timeit df.sort_values(['gr','col']).drop_duplicates('gr',keep='last').index
10 loops, best of 3: 67.3 ms per loop
# Baseline for comparison: the groupby/idxmax approach.
%timeit df.groupby('gr').col.idxmax()
1 loop, best of 3: 491 ms per loop
答案 1 :(得分:2)
from numba import njit
@njit
def idxmax_(bins, k, weights):
    """Return, for each of ``k`` bins, the position of its largest weight.

    Parameters
    ----------
    bins : 1-D integer array
        Bin (group) code of every element, aligned with ``weights``.
        Negative codes (``pandas.factorize`` uses -1 for missing keys) are
        skipped instead of wrapping around to the last bin.
    k : int
        Number of bins; valid codes are ``0 .. k - 1``.
    weights : 1-D numeric array
        Value of every element.

    Returns
    -------
    1-D int64 array of length ``k``
        Positions into ``weights`` of each bin's maximum, sorted ascending.
        On ties the first occurrence wins.  A bin that never occurs keeps
        the initial position 0.
    """
    out = np.zeros(k, np.int64)
    # min() of an empty array raises, so answer the empty case up front.
    if weights.size == 0:
        return out
    trk = np.zeros(k)
    # Shift the weights so the smallest becomes 1: every weight then beats
    # the initial 0 in ``trk``, so each bin that occurs gets a position.
    for i, w in enumerate(weights - (weights.min() - 1)):
        b = bins[i]
        if b < 0:
            continue
        # Strict '>' keeps the first occurrence when the maximum is tied.
        if w > trk[b]:
            trk[b] = w
            out[b] = i
    return np.sort(out)
def idxmax(df):
    """Return the index labels of the rows holding each group's largest 'col'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'gr' column (group key) and a 'col' column (value).

    Returns
    -------
    numpy.ndarray
        One index label per distinct 'gr' value, in ascending row position.
        With the default RangeIndex the labels equal the row positions, so
        the result is then a sorted integer array.
    """
    # factorize maps each distinct group key to a dense integer code.
    codes, uniques = pd.factorize(df.gr)
    positions = idxmax_(codes, len(uniques), df.col.values)
    # idxmax_ yields row positions; translate them to index labels so the
    # result also matches groupby().idxmax() when the index is not 0..n-1.
    return df.index.values[positions]
# Call on the full frame; the line that follows is the echoed result.
idxmax(df)
array([ 156, 220, 258, ..., 499945, 499967, 499982])
计时之前,请务必先调用一次函数,让它完成编译(预热):
# Warm-up call on a tiny frame so the @njit-decorated helper is compiled
# before any timing is taken.
idxmax(df.head())
然后计时
# Time the numba-based idxmax against the sort/drop_duplicates approach.
# The two result lines below presumably follow the order of the two %timeit
# calls (first idxmax, then sort/drop_duplicates) -- confirm.
%timeit idxmax(df)
%timeit df.sort_values(['gr', 'col'], ascending=False).drop_duplicates('gr').index
6.07 ms ± 15.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
152 ms ± 498 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
# Cross-check the three approaches: each yields one row label per group, so
# once sorted the three arrays should be identical.
idx0 = df.groupby('gr').col.idxmax().sort_values().values
idx1 = np.sort(idxmax(df))
idx2 = (
    df.sort_values(['gr', 'col'], ascending=False)
    .drop_duplicates('gr')
    .index.sort_values()
    .values
)
# array_equal returns False for arrays of different length, whereas `==`
# cannot compare such arrays element by element.
print(np.array_equal(idx0, idx1), np.array_equal(idx0, idx2), sep='\n')
True
True