# Column names used for grouping, and column names holding numeric values.
dimensions = ('product', 'place')
metrics = ('METRIC_1', 'METRIC_2')

# Sample records: one row per (product, place) pair.
input = [
    {'product': 'eggs', 'place': 'fridge', 'METRIC_1': 1, 'METRIC_2': 2},
    {'product': 'eggs', 'place': 'table', 'METRIC_1': 3, 'METRIC_2': 1},
    {'product': 'ham', 'place': 'fridge', 'METRIC_1': 1, 'METRIC_2': 2},
    {'product': 'ham', 'place': 'table', 'METRIC_1': 3, 'METRIC_2': 5},
]
对于每个维度以及维度的所有组合,我希望得到取值为“_all_”的汇总行,其中各指标被求和(或应用其他任意聚合方法)。
# Expected totals for the sample records: every non-empty subset of the
# dimensions is collapsed to '_all_' and the metrics are summed per group.
result = [
    {'product': '_all_', 'place': 'fridge', 'METRIC_1': 2, 'METRIC_2': 4},
    {'product': '_all_', 'place': 'table', 'METRIC_1': 6, 'METRIC_2': 6},
    {'product': 'eggs', 'place': '_all_', 'METRIC_1': 4, 'METRIC_2': 3},
    {'product': 'ham', 'place': '_all_', 'METRIC_1': 4, 'METRIC_2': 7},
    # Grand total: METRIC_2 is 2 + 1 + 2 + 5 = 10 over the four sample rows.
    {'product': '_all_', 'place': '_all_', 'METRIC_1': 8, 'METRIC_2': 10},
]
请注意,维度和指标的数量都是不固定的。如果答案是具有以下签名的函数,将不胜感激:
calc_totals(input_list, dimensions_list, {'metric_1': 'sum', 'metric_2': 'sum'}):
pass
我的尝试如下,但似乎太复杂,不确定它是否正确:
from operator import itemgetter
from itertools import groupby, combinations, chain
def powerset(iterable):
    """Return an iterator over every subset of *iterable* as a tuple.

    Subsets are produced in order of increasing size, starting with the
    empty tuple and ending with the tuple of all elements.
    """
    pool = list(iterable)
    subsets_by_size = (
        combinations(pool, size) for size in range(len(pool) + 1)
    )
    return chain.from_iterable(subsets_by_size)
def calc_totals(input, dimensions, metrics=('METRIC_1', 'METRIC_2')):
    """Build '_all_' subtotal rows for every non-empty subset of dimensions.

    For each non-empty combination of dimensions, those dimensions are
    collapsed to the literal value '_all_' and the rows are grouped by the
    remaining dimensions; each metric is summed within a group. The
    combination containing every dimension yields the grand-total row.

    Args:
        input: iterable of dict rows containing every dimension and metric key.
        dimensions: iterable of dimension key names (tuple, list or set).
        metrics: names of the numeric keys to sum; defaults to the two
            metric names used by the sample data.

    Returns:
        A list of dicts, one per subtotal group. Rows are ordered by the
        number of collapsed dimensions, then by first appearance in *input*.
        An empty *input* gives an empty list.
    """
    # Accept any iterable and drop duplicates while keeping a stable order.
    dimensions = tuple(dict.fromkeys(dimensions))
    rows = list(input)
    totals = []
    for size in range(1, len(dimensions) + 1):
        for collapsed in combinations(dimensions, size):
            kept = [dim for dim in dimensions if dim not in collapsed]
            sums = {}
            for row in rows:
                # Always a tuple, so one kept dimension and many are handled alike.
                key = tuple(row[dim] for dim in kept)
                bucket = sums.setdefault(key, dict.fromkeys(metrics, 0))
                for metric in metrics:
                    bucket[metric] += row[metric]
            for key, bucket in sums.items():
                total_row = dict(zip(kept, key))
                total_row.update(dict.fromkeys(collapsed, '_all_'))
                total_row.update(bucket)
                totals.append(total_row)
    return totals
答案 0 :(得分:0)
这是我给出的代码。它接收输入数据、维度列表以及一个从指标名到聚合函数的字典作为参数,然后遍历输入中的每一行,把各指标聚合到输出中每个相关的行上(输出在内部以嵌套 dict 的形式保存)。最后将该嵌套 dict 展平,生成列表形式的输出:
from itertools import combinations, chain, product
from collections import defaultdict
from operator import add
from pprint import pprint
# Grouping columns for the demo run.
dimensions = ('product', 'place')

# Demo records: one row per (product, place) pair with two numeric metrics.
src = [
    {'product': 'eggs', 'place': 'fridge', 'METRIC_1': 1, 'METRIC_2': 2},
    {'product': 'eggs', 'place': 'table', 'METRIC_1': 3, 'METRIC_2': 1},
    {'product': 'ham', 'place': 'fridge', 'METRIC_1': 1, 'METRIC_2': 2},
    {'product': 'ham', 'place': 'table', 'METRIC_1': 3, 'METRIC_2': 5},
]
def flatten(keys, d, level=0, cur=None):
    """Yield one flat dict per leaf of a nested dict.

    The nesting depth is ``len(keys)``: the dict key met at depth ``i`` is
    stored in the output under ``keys[i]``, and the mapping found at depth
    ``len(keys)`` is merged into the row as-is.

    Args:
        keys: sequence of names, one per nesting level.
        d: the (nested) mapping to walk.
        level: current depth; callers normally leave this at 0.
        cur: entries accumulated so far for the current path. It is never
            mutated, so nothing carries over between rows or between calls
            (the previous shared ``{}`` default leaked leaf entries into
            later calls).

    Yields:
        A new dict for each leaf mapping.
    """
    prefix = {} if cur is None else cur
    if level == len(keys):
        row = dict(prefix)
        row.update(d)
        yield row
    else:
        for k, v in d.items():
            # Copy per branch so sibling paths never share state.
            child = dict(prefix)
            child[keys[level]] = k
            for row in flatten(keys, v, level + 1, child):
                yield row
def calc_totals(input_list, dimension_list, aggregate):
    """Aggregate metrics into '_all_' subtotal rows for every dimension combo.

    Each input row contributes to one output row per non-empty combination
    of dimensions; in that output row the dimensions of the combination are
    replaced by '_all_' and the others keep the row's own values.

    Args:
        input_list: list of dict rows holding every dimension and metric key.
        dimension_list: sequence of dimension key names.
        aggregate: dict mapping metric name to a two-argument function
            ``func(accumulated, value)``. The accumulator starts at 0, so
            the function must accept 0 as its initial left operand.

    Returns:
        A list of flat dicts produced by ``flatten``; empty when
        *input_list* is empty.
    """
    if not input_list:
        return []

    # Autovivifying nested dict: one level per dimension, metrics at the leaf.
    def nested():
        return defaultdict(nested)

    result = nested()
    # Every non-empty tuple of dimensions that gets collapsed to '_all_'.
    combos = tuple(chain.from_iterable(
        combinations(dimension_list, n)
        for n in range(1, len(dimension_list) + 1)
    ))
    # Use the function's own arguments, not the module-level demo data.
    for row in input_list:
        for combo in combos:
            target = result
            # Walk (creating as needed) down to the leaf dict for this row/combo.
            for dim in dimension_list:
                key = '_all_' if dim in combo else row[dim]
                target = target[key]
            for metric, func in aggregate.items():
                target[metric] = func(target.get(metric, 0), row[metric])
    # Finally flatten the nested result into a list of rows.
    return list(flatten(dimension_list, result))
# Demo run: sum both metrics over the sample rows and pretty-print the totals.
pprint(
    calc_totals(
        src,
        dimensions,
        {'METRIC_1': add, 'METRIC_2': add},
    )
)
输出:
[{'METRIC_1': 4, 'METRIC_2': 7, 'place': '_all_', 'product': 'ham'},
{'METRIC_1': 8, 'METRIC_2': 10, 'place': '_all_', 'product': '_all_'},
{'METRIC_1': 2, 'METRIC_2': 4, 'place': 'fridge', 'product': '_all_'},
{'METRIC_1': 6, 'METRIC_2': 6, 'place': 'table', 'product': '_all_'},
{'METRIC_1': 4, 'METRIC_2': 3, 'place': '_all_', 'product': 'eggs'}]
只要把维度和聚合函数作为参数传入,它就应该能支持任意数量的维度和指标。
答案 1 :(得分:0)
def powerset(iterable):
    """Return an iterator over all subsets of *iterable*, each as a tuple.

    The empty tuple comes first, then subsets of size 1, 2, ... up to the
    full set.
    """
    items = list(iterable)
    sizes = range(len(items) + 1)
    return chain.from_iterable(combinations(items, r) for r in sizes)
def calc_totals(input_list, dimensions, metric_func_dict):
    """Build '_all_' subtotal rows, aggregating each metric with a numpy function.

    For every non-empty combination of dimensions, those dimensions are set
    to '_all_' and rows are grouped by the remaining dimensions. Within a
    group, the values of each metric are collected into a list and reduced
    by the numpy function named in *metric_func_dict*. The combination of
    all dimensions produces the grand-total row.

    Args:
        input_list: iterable of dict rows with every dimension and metric key.
        dimensions: iterable of dimension key names.
        metric_func_dict: dict mapping metric name to the name of a numpy
            function, e.g. ``{'METRIC_1': 'sum', 'METRIC_2': 'mean'}``.

    Returns:
        A list of dicts, ordered by number of collapsed dimensions and then
        by first appearance of each group in *input_list*. Metric values
        are whatever the numpy function returns.

    Raises:
        AttributeError: if a function name does not exist in numpy.
    """
    # Local import: no top-level import in this file provides ``np``.
    import numpy as np

    # Ordered and de-duplicated, so output order does not depend on set hashing.
    dimensions = tuple(dict.fromkeys(dimensions))
    rows = list(input_list)
    totals = []
    for size in range(1, len(dimensions) + 1):
        for dim_comb in combinations(dimensions, size):
            current_dims = [dim for dim in dimensions if dim not in dim_comb]
            groups = {}
            for item in rows:
                # Key is always a tuple, so no special case for a single
                # dimension or for non-string dimension values.
                key = tuple(item[dim] for dim in current_dims)
                groups.setdefault(key, []).append(item)
            for key, group in groups.items():
                temp_dict = dict(zip(current_dims, key))
                for metric, func_name in metric_func_dict.items():
                    method_to_call = getattr(np, func_name)
                    temp_dict[metric] = method_to_call(
                        [item[metric] for item in group])
                for dim in dim_comb:
                    temp_dict[dim] = '_all_'
                totals.append(temp_dict)
    return totals
另外,我相信,使用pandas可能有更好的解决方案