Python:计算字典列表的维度总计

时间:2016-07-11 07:42:49

标签: python

我有2个维度:

# Keys that identify a row; totals are requested over combinations of these.
dimensions = ('product', 'place')

和2个指标:

# Numeric fields of each row that are to be aggregated.
metrics = ('METRIC_1', 'METRIC_2')

输入是具有维度和指标的以下列表

# Sample rows: each dict carries both dimension values and both metric values.
# NOTE: this name shadows the built-in input() for the rest of the script.
input = [
    {'product': 'eggs', 'place': 'fridge', 'METRIC_1': 1, 'METRIC_2': 2},
    {'product': 'eggs', 'place': 'table', 'METRIC_1': 3, 'METRIC_2': 1},
    {'product': 'ham', 'place': 'fridge', 'METRIC_1': 1, 'METRIC_2': 2},
    {'product': 'ham', 'place': 'table', 'METRIC_1': 3, 'METRIC_2': 5},
]

对于每个维度以及维度的所有组合,我希望得到一行“_all_”汇总值,其中各项指标被求和(或应用任何其他聚合方法)。

预期结果:

# Expected totals for the sample rows: one row per aggregated combination,
# with '_all_' standing in for every dimension that was summed over.
result = [
    {'product': '_all_', 'place': 'fridge', 'METRIC_1': 2, 'METRIC_2': 4},
    {'product': '_all_', 'place': 'table', 'METRIC_1': 6, 'METRIC_2': 6},
    {'product': 'eggs', 'place': '_all_', 'METRIC_1': 4, 'METRIC_2': 3},
    {'product': 'ham', 'place': '_all_', 'METRIC_1': 4, 'METRIC_2': 7},
    # Grand total over all four sample rows: METRIC_2 is 2 + 1 + 2 + 5 = 10.
    {'product': '_all_', 'place': '_all_', 'METRIC_1': 8, 'METRIC_2': 10},
]

请注意,维度和指标的数量是不固定的。如果答案能以如下签名的函数给出,将不胜感激:

calc_totals(input_list, dimensions_list, {'metric_1': 'sum', 'metric_2': 'sum'}):
    pass

我的尝试如下,但似乎太复杂,不确定它是否正确:

from operator import itemgetter
from itertools import groupby, combinations, chain


def powerset(iterable):
    """Return an iterator over every subset of *iterable*, as tuples.

    The input is consumed once up front. Subsets come out smallest first,
    from the empty tuple up to the tuple holding every element.
    """
    pool = list(iterable)
    by_size = (combinations(pool, size) for size in range(len(pool) + 1))
    return chain.from_iterable(by_size)


def calc_totals(input, dimensions, metrics=('METRIC_1', 'METRIC_2')):
    """Sum metrics over every non-empty combination of dimensions.

    For each combination of dimensions, the rows are grouped by the
    remaining (kept) dimensions and the metrics are summed per group. The
    aggregated dimensions are reported with the value ``'_all_'``. The
    combination containing every dimension yields the grand-total row.

    Args:
        input: Iterable of dicts holding every dimension and metric key.
        dimensions: Iterable of dimension keys (tuple, list or set).
            Duplicates are ignored; the given order drives the row order.
        metrics: Keys whose values are summed. Defaults to the two metric
            names used by the sample data.

    Returns:
        A list of dicts, one per (combination, group). Combinations are
        visited smallest first and groups in sorted key order.

    Raises:
        KeyError: If a row lacks one of the dimension or metric keys.
    """
    rows = list(input)  # materialise once: rows are scanned per combination

    # Order-preserving de-duplication so any iterable of keys is accepted.
    dims = []
    for dim in dimensions:
        if dim not in dims:
            dims.append(dim)

    totals = []
    for size in range(1, len(dims) + 1):
        for dim_comb in combinations(dims, size):
            kept = [dim for dim in dims if dim not in dim_comb]

            # Keys are always tuples (empty for the grand total), so one
            # kept dimension and several are handled the same way.
            groups = {}
            for item in rows:
                key = tuple(item[dim] for dim in kept)
                sums = groups.get(key)
                if sums is None:
                    sums = groups[key] = dict.fromkeys(metrics, 0)
                for metric in metrics:
                    sums[metric] += item[metric]

            for key in sorted(groups):
                temp_dict = dict(zip(kept, key))
                temp_dict.update(groups[key])
                for dim in dim_comb:
                    temp_dict[dim] = '_all_'
                totals.append(temp_dict)
    return totals

2 个答案:

答案 0 :(得分:0)

这是我提出的代码。它以输入数据、维度以及聚合函数字典作为参数。然后遍历输入中的每一行,并把指标聚合到输出中所有相关的行上(输出在内部以嵌套dict保存)。最后,将结果dict展平以生成列表输出:

from itertools import combinations, chain, product
from collections import defaultdict
from operator import add
from pprint import pprint

# Dimension keys handed to calc_totals in the demo call below.
dimensions = ('product', 'place')

# Sample rows: one dict per (product, place) pair with two numeric metrics.
src = [
    {'product': 'eggs', 'place': 'fridge', 'METRIC_1': 1, 'METRIC_2': 2},
    {'product': 'eggs', 'place': 'table', 'METRIC_1': 3, 'METRIC_2': 1},
    {'product': 'ham', 'place': 'fridge', 'METRIC_1': 1, 'METRIC_2': 2},
    {'product': 'ham', 'place': 'table', 'METRIC_1': 3, 'METRIC_2': 5},
]

def flatten(keys, d, level=0, cur=None):
    """Yield one flat dict per leaf of a nested dict.

    Args:
        keys: Sequence of names, one per nesting level of *d*.
        d: Nested mapping ``len(keys)`` levels deep; the innermost mappings
            hold the values merged into each yielded dict.
        level: Current depth; callers normally leave the default.
        cur: Key/value pairs collected so far; callers normally leave the
            default. It is never mutated.

    Yields:
        A new dict mapping each name in *keys* to the key taken at that
        level, plus the contents of the leaf mapping.
    """
    # A fresh dict per call: a shared default would carry leaf keys from one
    # call into the next.
    if cur is None:
        cur = {}
    if level == len(keys):
        row = dict(cur)
        row.update(d)
        yield row
        return
    for k, v in d.items():
        # Copy per branch so sibling branches never see each other's keys.
        branch = dict(cur)
        branch[keys[level]] = k
        for x in flatten(keys, v, level + 1, branch):
            yield x


def calc_totals(input_list, dimension_list, aggregate):
    """Aggregate metrics over every non-empty combination of dimensions.

    Args:
        input_list: Sequence of dicts holding every dimension and metric key.
        dimension_list: Iterable of dimension keys.
        aggregate: Dict mapping metric name to a two-argument function
            ``func(accumulated, value)``; accumulation starts from 0.

    Returns:
        A list of dicts, one per distinct combination of kept dimension
        values. Aggregated dimensions carry the value ``'_all_'``. Returns
        an empty list when *input_list* is empty.

    Raises:
        KeyError: If a row lacks one of the dimension or metric keys.
    """
    if not input_list:
        return []

    dims = tuple(dimension_list)

    # Autovivification dict to store results
    def dd():
        return defaultdict(dd)
    result = dd()

    # Tuple of combos where each combo is a tuple of dimensions that are
    # aggregated
    combos = tuple(chain.from_iterable(
        combinations(dims, n) for n in range(1, len(dims) + 1)))

    # Use the arguments, not module-level data, so the function works for
    # any input and dimension names.
    for row in input_list:
        for combo in combos:
            target = result
            # Walk (and create on demand) one nesting level per dimension.
            for dim in dims:
                key = '_all_' if dim in combo else row[dim]
                target = target[key]
            # Fold this row's metrics into the leaf, using 0 as the start.
            for metric, func in aggregate.items():
                target[metric] = func(target.get(metric, 0), row[metric])

    # Finally flatten the results to a list
    return list(flatten(dims, result))

# Demo: aggregate both metrics with addition and pretty-print the totals.
pprint(calc_totals(src, dimensions, {'METRIC_1': add, 'METRIC_2': add}))

输出:

[{'METRIC_1': 4, 'METRIC_2': 7, 'place': '_all_', 'product': 'ham'},
 {'METRIC_1': 8, 'METRIC_2': 10, 'place': '_all_', 'product': '_all_'},
 {'METRIC_1': 2, 'METRIC_2': 4, 'place': 'fridge', 'product': '_all_'},
 {'METRIC_1': 6, 'METRIC_2': 6, 'place': 'table', 'product': '_all_'},
 {'METRIC_1': 4, 'METRIC_2': 3, 'place': '_all_', 'product': 'eggs'}]

只要把维度和聚合函数作为参数传入,它就应该支持任意数量的维度和指标。

答案 1 :(得分:0)

@niemmi,谢谢。你的方案在使用np.mean做聚合时会失败,所以我补充一个对我有效的解决方案。

def powerset(iterable):
    """Return an iterator over all subsets of *iterable*, each as a tuple.

    The input is read once into a list. Subsets are emitted in order of
    increasing size, starting with the empty tuple.
    """
    items = list(iterable)
    subset_sizes = range(len(items) + 1)
    return chain.from_iterable(
        combinations(items, size) for size in subset_sizes
    )

def calc_totals(input_list, dimensions, metric_func_dict):
    """Aggregate metrics with NumPy reducers over dimension combinations.

    For each non-empty combination of dimensions, the rows are grouped by
    the remaining (kept) dimensions and each metric's values are reduced
    with the NumPy function named in *metric_func_dict*. Aggregated
    dimensions are reported as ``'_all_'``. The combination containing
    every dimension yields the grand-total row.

    Args:
        input_list: Iterable of dicts holding every dimension and metric key.
        dimensions: Iterable of dimension keys. Duplicates are ignored; the
            given order drives the row order.
        metric_func_dict: Dict mapping metric name to the name of a NumPy
            function, e.g. ``{'METRIC_1': 'sum', 'METRIC_2': 'mean'}``.

    Returns:
        A list of dicts, one per (combination, group). Combinations are
        visited smallest first and groups in sorted key order. Metric
        values are whatever the NumPy function returns.

    Raises:
        AttributeError: If a function name does not exist in NumPy.
        KeyError: If a row lacks one of the dimension or metric keys.
    """
    # Local import: numpy is not imported at module level in this file.
    import numpy as np

    # Resolve the reducers once instead of once per group.
    reducers = {}
    for metric, func_name in metric_func_dict.items():
        reducers[metric] = getattr(np, func_name)

    rows = list(input_list)  # rows are scanned once per combination

    # Order-preserving de-duplication keeps the output order deterministic.
    dims = []
    for dim in dimensions:
        if dim not in dims:
            dims.append(dim)

    totals = []
    for size in range(1, len(dims) + 1):
        for dim_comb in combinations(dims, size):
            kept = [dim for dim in dims if dim not in dim_comb]

            # Keys are always tuples (empty for the grand total), so no
            # type check on the key is needed for a single kept dimension.
            groups = {}
            for item in rows:
                key = tuple(item[dim] for dim in kept)
                values = groups.get(key)
                if values is None:
                    values = groups[key] = {}
                    for metric in reducers:
                        values[metric] = []
                for metric in reducers:
                    values[metric].append(item[metric])

            for key in sorted(groups):
                temp_dict = dict(zip(kept, key))
                for metric, reducer in reducers.items():
                    temp_dict[metric] = reducer(groups[key][metric])
                for dim in dim_comb:
                    temp_dict[dim] = '_all_'
                totals.append(temp_dict)
    return totals

另外,我相信,使用pandas可能有更好的解决方案