Question

我有一个User对象，有两个布尔属性，如下所示：

class User(object):
  """A user carrying two independent boolean flags, ``a`` and ``b``."""

  def __init__(self, a, b):
    # Both flags are always bools.
    self.a, self.b = a, b

我有一个名为user_list的对象列表，我希望统计其中分别有多少个对象满足a == True、a == False、b == True、b == False。

我最初的方法是使用collections.Counter，但这需要在列表中循环两次：

# One Counter per attribute; each generator expression walks user_list once,
# so the list is traversed twice in total.
a_count = collections.Counter(u.a for u in user_list)
b_count = collections.Counter(u.b for u in user_list)
# Counter yields 0 for a key it has never seen, so these lookups are safe even
# when one of the values never occurred.
print a_count[True], a_count[False], b_count[True], b_count[False]

我还想过只使用4个计数器，但这很难看，而且感觉不够pythonic：

a_true_count = 0
a_false_count = 0
b_true_count = 0
b_false_count = 0
for u in user_list:
  if u.a:
    a_true_count += 1
  else:
    a_false_count += 1
  if u.b:
    b_true_count += 1
  else:
    a_false_count += 1
print a_true_count, a_false_count, b_true_count, b_false_count

有更有效的方法吗？输出可以是任何东西：4个单独的变量，带有值的字典，列表，元组，等等，只要它有4个值。

提前致谢！

Answer 1

我认为使用collections.Counter是正确的想法，只需使用单个Counter和单循环以更通用的方式进行：

from collections import Counter

# Sample data: every (a, b) combination appears exactly once.
user_list = [User(True, False), User(False, True), User(True, True), User(False, False)]
# A single Counter keyed by strings such as 'a_True' / 'b_False'.
user_attr_count = Counter()

for user in user_list:
    # '%s' formatting of a bool yields 'True' or 'False', giving four possible keys.
    user_attr_count['a_%s' % user.a] += 1
    user_attr_count['b_%s' % user.b] += 1

print user_attr_count
# Counter({'b_False': 2, 'a_True': 2, 'b_True': 2, 'a_False': 2})

Answer 2

为什么不使用两个计数器，并从user_list的长度中减去以找到其他两个值？

# Each user is either True or False for an attribute, so the False tally is
# whatever remains after the True tally (a_true_count / b_true_count are
# assumed to have been computed already).
a_false_count = len(user_list) - a_true_count

b_false_count = len(user_list) - b_true_count

这样的显式循环可能是时间上最有效的解决方案，但是如果你正在寻找一些更简洁的代码，你可以试试filter()：

# Keep only the users whose flag is False, then count them.  filter() returns
# a list on Python 2 (the dialect used throughout this page), so len() works.
a_false_count = len(filter(lambda x: not x.a, user_list))
b_false_count = len(filter(lambda x: not x.b, user_list))

Answer 3

您可以使用位掩码：

def count(user_list, mask):
    """Tally users by their masked two-bit flag code.

    Each user is encoded as an integer with ``a`` in bit 1 and ``b`` in
    bit 0; the code is ANDed with ``mask`` before being counted.

    Returns a Counter mapping each masked code to its number of occurrences.
    """
    tally = Counter()
    for user in user_list:
        code = (user.a << 1) | user.b
        tally[code & mask] += 1
    return tally

# Masks selecting which attribute bits take part in the count:
a=0b10  # keep only the bit derived from u.a
b=0b01  # keep only the bit derived from u.b
aANDb=0b11  # keep both bits
print count(user_list,aANDb)

Answer 4

from collections import Counter

c = Counter()
for u in user_list:
    # Adding a bool adds 1 for True and 0 for False, so each entry ends up
    # holding the number of True values for that attribute.
    c['a'] += u.a
    c['b'] += u.b

# The False counts are the remainder of the list length.
print c['a'], len(user_list) - c['a'], c['b'], len(user_list) - c['b']

Answer 5

这个解决方案与你的第一种方法很接近，只不过它只遍历列表一次。它创建两个计数器，遍历列表，并针对每个用户更新这两个计数器。实际进行计数的步骤如下：

for user in user_list:
    # update() takes an iterable, hence the one-element list around each value.
    a_count.update([user.a])
    b_count.update([user.b])

它使用update方法更新每个计数器对象。你可以这样做，而不是像第一个例子那样用生成器在一行中创建计数器。完整的代码示例如下：

import collections

class User(object):
    """Simple record holding two flags, ``a`` and ``b``."""

    def __init__(self, a, b):
        self.a, self.b = a, b

# Sample data: every (a, b) combination appears exactly once.
user_list = [
    User(True, False),
    User(False, True),
    User(True, True),
    User(False, False)
]

# One Counter per attribute, both filled during the same pass below.
a_count = collections.Counter()
b_count = collections.Counter()

for user in user_list:
    # update() takes an iterable, hence the one-element list around each value.
    a_count.update([user.a])
    b_count.update([user.b])


print a_count[True], a_count[False], b_count[True], b_count[False]

Answer 6

我喜欢使用zip和map来填充这些内容：

from collections import Counter
# for test, import random:
import random

# define class
class User(object):
  """Holds the two boolean attributes ``a`` and ``b`` of a user."""

  def __init__(self, a, b):
    # Both values are always bools.
    self.a, self.b = a, b

# create an arbitrary set
# (100 users; a is True for even r, b is True when r is divisible by 3)
users = [ User( r % 2 == 0, r % 3 == 0 ) for r in (random.randint(0,100) for x in xrange(100)) ]

# and... count
# zip(*pairs) regroups the (a, b) pairs into one sequence of a values and one
# of b values; map then builds a Counter from each.
aCounter, bCounter = map(Counter, zip(*((u.a, u.b) for u in users)))

更新：map(sum, zip(*tuples))在较小的样本上略快于for循环，但对于较大的样本，for循环的扩展性要好得多。与其他方法不同，for循环并不会因为改为在元组列表上操作而获得很大的性能提升，可能是因为它本身已经非常优化了。

collections.Counter仍然很慢。

import random
import itertools
import time
from collections import Counter 

# define class
class User(object):
  """Benchmark fixture: a user with two boolean attributes, ``a`` and ``b``."""

  def __init__(self, a, b):
    # Both values are always bools.
    self.a, self.b = a, b

# create an arbitrary sample
# (100 users; a is True for even r, b is True when r is divisible by 3)
users = [ User( r % 2 == 0, r % 3 == 0 ) for r in (random.randint(0,100) for x in xrange(100)) ]
# create a list of tuples of the arbitrary sample
# (the *2 benchmarks read these tuples instead of User attributes)
users2 = [ ( u.a,u.b) for u in users ] 

# useful function-timer decorator           
def timer(times=1):
    """Build a decorator that runs the wrapped function ``times`` times per call.

    Each call of the decorated function prints the function name, the repeat
    count, the total elapsed time measured with time.time() and the resulting
    calls per second, then returns the value produced by the last run.
    """
    def outer(fn):
        def wrapper(*args, **kwargs):
            t0 = time.time()
            for n in xrange(times):
                r = fn(*args, **kwargs)
            dt = time.time() - t0
            # NOTE(review): `r` is unbound when times == 0, and times/dt raises
            # ZeroDivisionError if the clock reports no elapsed time.
            print '{} ran {} times in {} seconds with {:f} ops/sec'.format(fn.__name__, times, dt, times/dt)
            return r
        return wrapper
    return outer 

# now create the timeable functions         
n=10000
@timer(times=n)
def time_sum():
    """Sum the a column and the b column of (a, b) pairs built from ``users``."""
    return map(sum, zip(*((u.a, u.b) for u in users)))
@timer(times=n)
def time_counter():
    """Build one Counter per column of (a, b) pairs built from ``users``."""
    return map(Counter, zip(*((u.a, u.b) for u in users)))
@timer(times=n)
def time_for():
    """Count True a and b values with an explicit loop over ``users``.

    Returns a tuple (number of True a values, number of True b values).
    """
    a,b=0,0
    for u in users:
        # Identity test: only the literal True object is counted.
        if u.a is True:
            a += 1
        if u.b is True:
            b += 1
    return a,b
@timer(times=n)
def time_itermapzip():
    """Column sums over ``users`` via itertools.imap/izip, collected into a list."""
    return list(itertools.imap(sum, itertools.izip(*((u.a, u.b) for u in users))))

@timer(times=n)
def time_sum2():
    """Sum the a column and the b column of the prebuilt ``users2`` tuples."""
    return map(sum, zip(*users2))
@timer(times=n)
def time_counter2():
    """Build one Counter per column of the prebuilt ``users2`` tuples."""
    return map(Counter, zip(*users2))
@timer(times=n)
def time_for2():
    """Count True a and b values with an explicit loop over ``users2`` tuples.

    Returns a tuple (number of True a values, number of True b values).
    """
    a,b=0,0
    for _a,_b in users2:
        # Identity test: only the literal True object is counted.
        if _a is True:
            a += 1
        if _b is True:
            b += 1
    return a,b
@timer(times=n)
def time_itermapzip2():
    """Column sums over ``users2`` via itertools.imap/izip, collected into a list."""
    return list(itertools.imap(sum, itertools.izip(*users2))) 

# Run every benchmark once; each call repeats its body n times and prints one
# timing line.  v is overwritten each time and ends up holding the last result.
v = time_sum()
v = time_counter()
v = time_for()
v = time_itermapzip()

# Same four approaches, this time reading the prebuilt users2 tuples.
v = time_sum2()
v= time_counter2()
v = time_for2()
v = time_itermapzip2() 

# time_sum ran 10000 times in 0.446894168854 seconds with 22376.662523 ops/sec
# time_counter ran 10000 times in 1.29836297035 seconds with 7702.006471 ops/sec
# time_for ran 10000 times in 0.267076015472 seconds with 37442.523554 ops/sec
# time_itermapzip ran 10000 times in 0.459508895874 seconds with 21762.364319 ops/sec
# time_sum2 ran 10000 times in 0.174293994904 seconds with 57374.323226 ops/sec
# time_counter2 ran 10000 times in 0.989939928055 seconds with  10101.623055 ops/sec
# time_for2 ran 10000 times in 0.183295965195 seconds with 54556.574605 ops/sec
# time_itermapzip2 ran 10000 times in 0.193426847458 seconds with 51699.131384 ops/sec

# v holds the two True counts from the last benchmark run; the False counts
# are the remainder of the sample size.
print "True a's: {}\t False a's: {}\nTrue b's: {}\t False b's:{}".format(v[0], len(users)-v[0], v[1], len(users)-v[1]) 
# True a's: 53   False a's: 47
# True b's: 31   False b's:69
v
# [53, 31]

样本大小为1000的相同代码：

# time_sum ran 10000 times in 9.30428719521 seconds with 1074.773359 ops/sec
# time_counter ran 10000 times in 16.7009849548 seconds with 598.767080 ops/sec
# time_for ran 10000 times in 2.61371207237 seconds with 3825.976130 ops/sec
# time_itermapzip ran 10000 times in 9.40824103355 seconds with 1062.897939 ops/sec
# time_sum2 ran 10000 times in 5.70988488197 seconds with 1751.348794 ops/sec
# time_counter2 ran 10000 times in 13.4643371105 seconds with 742.702735 ops/sec
# time_for2 ran 10000 times in 2.49017906189 seconds with 4015.775473 ops/sec
# time_itermapzip2 ran 10000 times in 6.10926699638 seconds with 1636.857581 ops/sec

Python：在对象列表中获得两个布尔属性频率的最有效方法？

6 个答案: