Question

我需要一个解决方案，用于从非唯一列表中提取唯一元素，并统计每个元素的重复次数。

该解决方案的目的是在一个算法中使用它，以便从非唯一列表创建唯一组合。这种情况下，用于创建组合的列表通常非常小（少于50个元素），但我的目标是找到最快的代码，尽可能在每一处都做优化（即使只能节省极少量的运行时间）。

Python 的 collections 模块提供了一个专门用于此类目的的 collections.Counter，但在某些情况下，使用简单的字典代替 collections.Counter 会得到更快的代码，你可以用以下代码自行验证：

from time   import time   as t
from timeit import timeit as tt

from collections import Counter
def counter(iterable):
    """Count the occurrences of every item of ``iterable`` with a plain dict.

    Args:
        iterable: Any iterable of hashable items.

    Returns:
        A dict mapping each distinct item to the number of times it occurs.
    """
    tally = {}
    for element in iterable:
        # First sighting: start the count at one and move on.
        if element not in tally:
            tally[element] = 1
            continue
        tally[element] += 1
    return tally

# Benchmark the dict-based counter() against collections.Counter.
# Each (n, N) pair builds a list of n*N items that holds N distinct values,
# each of them repeated n times.
for n, N in [(1,10), (10,1), (1,50), (50,1), (1,100), (100,1), (1,200), (200, 1), (1, 500), (500, 1), (1, 1000), (1000,1)]:
    lstItems = n*list(range(N))
    # range(5, 6) yields the single exponent 5, i.e. 100000 repetitions.
    for noLoops in [10**p for p in range(5, 6)]: 
        # Manual wall-clock timing of the dict-based function.
        s = t()
        for _ in range(noLoops): 
            dctCounter = counter(lstItems)
        e = t()
        timeDctFctn = e - s
        # Manual wall-clock timing of collections.Counter.
        s = t()
        for _ in range(noLoops): 
            objCounter = Counter(lstItems)
        e = t()
        timeCollCtr = e - s
        # The same two measurements via timeit; the setup string imports the
        # names from __main__, so this has to be executed as a script.
        timeitCollCtr = tt("objCounter=Counter(lstItems)", "from __main__ import Counter, lstItems", number=noLoops)
        timeitDctFctn = tt("dctCounter=counter(lstItems)", "from __main__ import counter, lstItems", number=noLoops)
        # Alternative report that also shows the manual timings
        # (timeCollCtr / timeDctFctn are only consumed by this disabled line):
        # print("Loops: {:7}, time/timeit CollCtr: {:7.5f}/{:7.5f} DctFctn: {:7.5f}/{:7.5f} sec. lstSize: {:3}, %uniq: {:3.0f}".format(noLoops, timeCollCtr, timeitCollCtr, timeDctFctn, timeitDctFctn, n*N, 100.0/n))
        # Report only the timeit results; %uniq is computed as 100.0/n.
        print("collections.Counter(): {:7.5f},  def counter(): {:7.5f} sec. lstSize: {:3}, %uniq: {:3.0f}, ({} timitLoops)".format(timeitCollCtr, timeitDctFctn, n*N, 100.0/n, noLoops))    
        # print('-----------------------------------------------------------------------------------------------------------')

这里输出：

python3.6 -u "collections.Counter-vs-dictionaryAsCounter_Cg.py"
collections.Counter(): 0.36461, def counter(): 0.09592 sec. lstSize:  10, %uniq: 100, (100000 timitLoops)
collections.Counter(): 0.36444, def counter(): 0.12286 sec. lstSize:  10, %uniq:  10, (100000 timitLoops)
collections.Counter(): 0.58627, def counter(): 0.43233 sec. lstSize:  50, %uniq: 100, (100000 timitLoops)
collections.Counter(): 0.52399, def counter(): 0.54106 sec. lstSize:  50, %uniq:   2, (100000 timitLoops)
collections.Counter(): 0.82332, def counter(): 0.81436 sec. lstSize: 100, %uniq: 100, (100000 timitLoops)
collections.Counter(): 0.72513, def counter(): 1.06823 sec. lstSize: 100, %uniq:   1, (100000 timitLoops)
collections.Counter(): 1.27130, def counter(): 1.59476 sec. lstSize: 200, %uniq: 100, (100000 timitLoops)
collections.Counter(): 1.13817, def counter(): 2.14566 sec. lstSize: 200, %uniq:   0, (100000 timitLoops)
collections.Counter(): 3.16287, def counter(): 4.26738 sec. lstSize: 500, %uniq: 100, (100000 timitLoops)
collections.Counter(): 2.64247, def counter(): 5.67448 sec. lstSize: 500, %uniq:   0, (100000 timitLoops)
collections.Counter(): 4.89153, def counter(): 7.68661 sec. lstSize:1000, %uniq: 100, (100000 timitLoops)
collections.Counter(): 6.06389, def counter():13.92613 sec. lstSize:1000, %uniq:   0, (100000 timitLoops)
>Exit code: 0

P.S. 看来 collections.Counter() 在另一个场景下也达不到预期。见这里：https://stackoverflow.com/questions/41594940/why-is-collections-counter-so-slow

Answer 1

当你对较短的可迭代对象计数时，Counter 会遇到一个主要瓶颈：它会检查 if isinstance(iterable, Mapping)。这个测试相当慢，因为 collections.abc.Mapping 是一个抽象基类（使用抽象元类），因此这种 isinstance 检查比普通的 isinstance 检查更复杂，参见例如："Why is checking isinstance(something, Mapping) so slow?"

因此，对于较短的可迭代对象，其他方法更快并不令人惊讶。但是对于较长的可迭代对象，这项检查就无关紧要了，Counter 应该更快（至少对于 python-3.x（CPython）而言，其中实际的计数函数 _count_elements 是用 C 编写的）。

识别瓶颈的一种简单方法是性能分析。我在这里使用 line_profiler，下面依次是分析代码及其结果：

# IPython session: load the line_profiler extension before using %lprun.
%load_ext line_profiler

from collections import Counter
# A short input of 50 distinct integers.
x = range(50)
# Profile the function Counter.update when executing the command "Counter(x)"
%lprun -f Counter.update Counter(x)

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
   604         1            8      8.0      3.9          if not args:
   605                                                       raise TypeError("descriptor 'update' of 'Counter' object "
   606                                                                       "needs an argument")
   607         1           13     13.0      6.4          self, *args = args
   608         1            6      6.0      3.0          if len(args) > 1:
   609                                                       raise TypeError('expected at most 1 arguments, got %d' % len(args))
   610         1            5      5.0      2.5          iterable = args[0] if args else None
   611         1            3      3.0      1.5          if iterable is not None:
   612         1           94     94.0     46.3              if isinstance(iterable, Mapping):
   613                                                           if self:
   614                                                               self_get = self.get
   615                                                               for elem, count in iterable.items():
   616                                                                   self[elem] = count + self_get(elem, 0)
   617                                                           else:
   618                                                               super(Counter, self).update(iterable) # fast path when counter is empty
   619                                                       else:
   620         1           69     69.0     34.0                  _count_elements(self, iterable)
   621         1            5      5.0      2.5          if kwds:
   622                                                       self.update(kwds)

因此，从可迭代对象初始化 Counter 所需的时间有一个相当大的常数开销（46％ 的时间用于 isinstance 检查，而获取带有计数的字典只占 34％）。

然而，对于较长的可迭代对象，这并不重要（因为该检查只执行一次）：

%lprun -f Counter.update Counter([1]*100000)

Line #      Hits         Time  Per Hit   % Time  Line Contents
==============================================================
   604         1           12     12.0      0.0          if not args:
   605                                                       raise TypeError("descriptor 'update' of 'Counter' object "
   606                                                                       "needs an argument")
   607         1           12     12.0      0.0          self, *args = args
   608         1            6      6.0      0.0          if len(args) > 1:
   609                                                       raise TypeError('expected at most 1 arguments, got %d' % len(args))
   610         1            6      6.0      0.0          iterable = args[0] if args else None
   611         1            3      3.0      0.0          if iterable is not None:
   612         1           97     97.0      0.3              if isinstance(iterable, Mapping):
   613                                                           if self:
   614                                                               self_get = self.get
   615                                                               for elem, count in iterable.items():
   616                                                                   self[elem] = count + self_get(elem, 0)
   617                                                           else:
   618                                                               super(Counter, self).update(iterable) # fast path when counter is empty
   619                                                       else:
   620         1        28114  28114.0     99.5                  _count_elements(self, iterable)
   621         1           13     13.0      0.0          if kwds:
   622                                                       self.update(kwds)

为了让您了解这些方法的性能如何随元素数量变化，作为比较，我提供了您的 count 的优化版本，以及 Counter 所使用的 _count_elements 函数。但是，我排除了您对项目进行 sorted 排序并创建计数列表的部分，以避免其他影响——特别是因为 sorted 的运行时行为（O(n log(n))）与计数（O(n)）不同：

结果：

line_profiler

各项计时：

# Setup

import random
from collections import Counter, _count_elements

def count(iterable):
    """Build a dict of occurrence counts by looping over the items directly.

    Args:
        iterable: Any iterable of hashable items.

    Returns:
        A dict mapping each distinct item to how often it was seen.
    """
    seen = {}
    for element in iterable:
        if element not in seen:
            # New key: initialise its counter.
            seen[element] = 1
        else:
            seen[element] += 1
    return seen

def count2(iterable):
    """Build a dict of occurrence counts by walking the indices of a sequence.

    The input must support ``len()`` and integer indexing, and every
    membership test goes through ``dict.keys()``.

    Args:
        iterable: A sequence of hashable items.

    Returns:
        A dict mapping each distinct item to how often it was seen.
    """
    tally = {}
    size = len(iterable)
    for position in range(size):
        element = iterable[position]
        if element not in tally.keys():
            tally[element] = 1
        else:
            tally[element] += 1
    return tally

def c_count(iterable):
    """Count items with ``_count_elements``, the helper ``Counter`` relies on.

    A fresh, empty dict is handed to ``_count_elements`` together with
    ``iterable`` and returned afterwards.

    Args:
        iterable: Any iterable of hashable items.

    Returns:
        A plain dict mapping each distinct item to its number of occurrences.
    """
    tally = dict()
    _count_elements(tally, iterable)
    return tally

# Timing

# One list of %timeit results per benchmarked callable, keyed by the callable.
timings = {Counter: [], count: [], count2: [], c_count: []}

# Input sizes are the powers of two 2**1 ... 2**19.
for i in range(1, 20):
    print(2**i)
    # 2**i random integers drawn from the inclusive range [0, 2**i].
    it = [random.randint(0, 2**i) for _ in range(2**i)]
    for func in (Counter, count, count2, c_count):
        # IPython magic; -o makes %timeit return its result object.
        res = %timeit -o func(it)
        timings[func].append(res)

# Plotting

%matplotlib notebook

import matplotlib.pyplot as plt
import numpy as np

# Plot the average run time of every counting approach against the input
# size on log-log axes.
fig = plt.figure(1)
ax = plt.subplot(111)

# The timing loop measures sizes 2**1, 2**2, ... and stores one result per
# size. Deriving the x values from the number of recorded results keeps x and
# y the same length; the previous hard-coded np.arange(1, 5) produced only
# four x values, which ax.plot rejects when more sizes were timed.
n = 2**np.arange(1, len(timings[count]) + 1)

# (benchmarked callable, legend label, line colour), in drawing order.
series = (
    (count, 'my custom function', 'red'),
    (count2, 'your custom function', 'green'),
    (Counter, 'Counter', 'blue'),
    (c_count, '_count_elements', 'purple'),
)
for plotted, label, colour in series:
    ax.plot(n,
            [result.average for result in timings[plotted]],
            label=label, c=colour)

ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('elements')
ax.set_ylabel('time to count them [seconds]')
ax.grid(which='both')
ax.legend()
plt.tight_layout()

为什么在某些情况下字典会比 collections.Counter 更快？

1 个答案: