如何在Python中为不同的嵌套字典生成填充率?

时间:2018-01-25 22:09:21

标签: python loops counting

我有一个在Python中接收嵌套字典的进程:

示例嵌套字典架构(伪代码)

key1: value1,
key2: dict(
  key3: value2,
  key4: value3,
),
key5: list(value4,value5) # any value is fine, just not empty or null

示例嵌套字典数据(伪代码)

key1: 'value',
key2: dict(
  key3: '',
  key4: 12345,
),
key5: list()

我想迭代/扫描这个dict并检查每个键是否有值(不是Null或空白 - false / 0都可以)。我需要扫描一堆结构完全相同的字典,来获得这套字典整体的“填充率”。该流程每次运行时都会看到不同格式的dicts集,因此需要自动生成填充率报告:

上面的单个嵌套示例的示例填充率(理想情况下是平坦的字典):

key1: 1
key2: 1
key2-key3: 0
key2-key4: 1
key5: 0

例如,如果我们扫描了十个结构相同的字典,我们可能会看到像这样的“填充率”:

key1: 5
key2: 6
key2-key3: 6
key2-key4: 4
key5: 3

问题

  1. 扫描不同结构的dicts以获得填充率的最pythonic方法是什么?如果我必须这样做数百万次,是否有更有效的方法?

  2. 创建平面字典以存储计数以及如何更新它的最pythonic方法是什么?

4 个答案:

答案 0 :(得分:1)

这是我的看法:

  

根据填充率扫描不同结构的dicts的最pythonic方法是什么?

递归。特别是,我会把已遍历子树的结果返回给调用者,由调用者负责将多个子树的结果合并进它自己这棵树的结果中。

  

如果我必须这样做数百万次,是否有更有效的方法?

可能。尝试一个解决方案,看看它是否是A)正确和B)足够快。如果两者兼而有之,请不要费心寻找效率。

  

创建平面字典以存储计数以及如何更新它的最pythonic方法是什么?

使用Python附带的一个库。在这种情况下是 collections.Counter(),并通过调用它的 .update() 方法来更新。

结果:

from collections import Counter
from pprint import pprint

# Sample input: a filled value, a nested dict with one empty field, and an
# empty list at the top level.
example1_dict = {
    'key1': 'value',
    'key2': {
        'key3': '',
        'key4': 12345,
    },
    'key5': [],
}

# Same shape, but a different nested key name and a non-empty list.
example2_dict = {
    'key1': 'value',
    'key7': {
        'key3': '',
        'key4': 12345,
    },
    'key5': [1],
}

def get_fill_rate(d, path=()):
    """Count filled keys in one (possibly nested) dict.

    Returns a Counter mapping key paths (tuples of keys) to 1 if the value
    is "filled" and 0 otherwise.  A value is filled when it is truthy, or
    exactly False/0 (the question treats those as valid data); None, '',
    and empty containers count as empty.  A nested dict counts as filled
    itself and is walked recursively, its children keyed under the
    extended path.
    """
    result = Counter()
    for k, v in d.items():
        key = path + (k,)
        if isinstance(v, dict):
            # The parent key of a nested dict counts as filled, and the
            # child counts are merged in under the extended path.
            result[key] += 1
            result.update(get_fill_rate(v, key))
        else:
            # Single expression replaces the original's three redundant
            # branches (False/0 filled, truthy filled, otherwise 0 — the
            # `+= 0` still records the key with a zero count).
            result[key] += 1 if (v or v in (False, 0)) else 0
    return result

def get_fill_rates(l):
    """Aggregate per-dict fill counts across every dict in *l*.

    Returns a plain dict mapping key paths to total fill counts.
    Counter.update (rather than +) is used so that zero counts from
    get_fill_rate survive the merge.
    """
    merged = Counter()
    for entry in l:
        merged.update(get_fill_rate(entry))
    return dict(merged)

# Aggregate fill counts across both example dicts.
result = get_fill_rates([example1_dict, example2_dict])

# Raw result: keys are tuples of nested key names.
pprint(result)

# Formatted result: a tuple of strings joins directly — the original's
# `single_key for single_key in key` generator was a no-op wrapper.
print('\n'.join(
    '-'.join(key) + ': ' + str(value)
    for key, value in sorted(result.items())))

答案 1 :(得分:0)

好的,我想我解决了。我做了一些非常小的测试,但我觉得这很有效:

def scan_dict(d):
    """Count one occurrence of every key in *d*, flattening nested dicts
    into 'parent-child' keys.

    Note: values are not checked for emptiness — every key present counts
    once (the answer leaves that check to the reader).
    """
    counts = {}
    for key, value in d.items():
        if isinstance(value, dict):
            # Fold the child's counts in under hyphenated flat keys.
            for child_key, child_count in scan_dict(value).items():
                flat_key = str(key) + "-" + str(child_key)
                counts[flat_key] = counts.get(flat_key, 0) + child_count
        # The key itself always contributes one occurrence.
        name = str(key)
        counts[name] = counts.get(name, 0) + 1
    return counts

def scan_all_dicts(ds):
    """Merge per-dict key counts from scan_dict over an iterable of dicts
    and return the combined flat-key totals."""
    totals = {}
    for mapping in ds:
        for key, value in scan_dict(mapping).items():
            totals[key] = totals.get(key, 0) + value
    return totals

基本上,有一个递归函数可以扫描每个字典并计算其中的所有内容,并对任何子字典递归地做同样的处理。

“driver”是第二个函数,它接受一个可迭代的(例如列表)dicts并在第一个函数中运行它们,然后返回所有值的扁平列表。

我没有检查这些值以确保它们“不是空白”;我会把它留给你。

答案 2 :(得分:0)

递归是解决这个问题的最 Pythonic 的方式;不过,此解决方案还利用装饰器更新一个全局字典来存储整体填充率。由于使用了 collections.defaultdict,每次调用 get_occurences 时 final_dict 都可以被多次更新。

from collections import defaultdict
import re
# Module-level accumulator: flat key -> total fill count across all calls.
final_dict = defaultdict(int)

def fill(f):
   """Decorator: call *f* to build a (possibly nested) occurrence mapping,
   then fold its 0/1 counts into the module-level final_dict.

   Fixes vs. the posted code: `update_final(data)` is re-indented to match
   the enclosing body (the original was a SyntaxError), and `last` gets a
   default of None because callers invoke the decorated function with a
   single argument.  The needless `global` declaration is dropped —
   final_dict is only mutated, never rebound.
   """
   def update_count(structure, last=None):
      data = f(structure, last=None)
      def update_final(d):
         # update_final returns None, so a nested-dict value contributes
         # int(bool(None)) == 0 to its own key — preserved as-is, since
         # the answer's sample output depends on it.
         for a, b in d.items():
            final_dict[a] += int(bool(b)) if not isinstance(b, dict) else int(bool(update_final(b)))
      update_final(data)
   return update_count

@fill
def get_occurences(d, last=None):
   # Flatten *d* into {key-or-"last-key": 0/1} fill indicators in a single
   # dict comprehension; *last* carries the parent key for prefixing.
   # NOTE(review): the recursive call goes through the @fill decorator,
   # whose wrapper returns None — so nested-dict values appear as None in
   # the produced mapping and the decorator counts them as 0.  Confirm
   # this matches the intended semantics before reusing.
   return {"{}-{}".format(last, a) if last else a:int(bool(b)) if not isinstance(b, dict) else get_occurences(b, a) for a, b in d.items()}

# Three sample structures with overlapping but non-identical schemas.
structures = [{'key1':'value', 'key2':{'key3':'', 'key4':12345}, 'key5':[]}, {'key1':18, 'key2':'value1', 'key3':['James', 'Bob', 'Bill']},{'key1':'value2', 'key2':{'key3':'233', 'key4':12345}, 'key5':100}]
for structure in structures:
   get_occurences(structure)

# Sort by the trailing digit of the key, with flat keys before hyphenated
# ones.  Python 3 removed tuple-unpacking lambdas (PEP 3113), so the
# original `lambda (c, d): ...` is a SyntaxError — unpack the (key, count)
# item explicitly instead.
for i in sorted(final_dict.items(),
                key=lambda item: (int(re.findall(r'\d+$', item[0])[0]),
                                  bool(re.findall(r'\w+-\w+', item[0])))):
  print("{}: {}".format(*i))

输出:

{'key2-key3': 1, 'key2-key4': 2, 'key1': 3, 'key2': 1, 'key5': 1, 'key3': 1}

输出:

key1: 3
key2: 1
key3: 1
key2-key3: 1
key2-key4: 2
key5: 1

答案 3 :(得分:0)

您可以尝试这样的事情:

# Sample nested input: one filled value, a nested dict with an empty
# string, and an empty list.
example1_dict = {
    'key1': 'value',
    'key2': {
        'key3': '',
        'key4': 12345,
    },
    'key5': [],
}



# Flatten example1_dict into {key-or-"outer-inner": 0/1} fill indicators.
example={}
for ka,l in example1_dict.items():
    if isinstance(l,dict):
        # Helper for nested dicts; it closes over the current top-level
        # key `ka` rather than using its own `track` parameter.
        def hi(fg, track=''):
            print(fg)  # debug: shows the sub-dict being walked
            for i, k in fg.items():
                # NOTE(review): the prefix is always the top-level `ka`,
                # so dicts nested more than one level deep would lose
                # their intermediate path segments.
                track="{}-{}".format(ka,i)
                # NOTE(review): membership is tested on the inner key `i`
                # but entries are stored under `track`; for the sample
                # data this always takes the `= 1` branch — verify before
                # relying on the `+= 1` path.
                if i not in example:
                    example[track] = 1
                else:
                    example[track] += 1
                if isinstance(k, dict):
                    # NOTE(review): `return` abandons any remaining
                    # sibling keys once a nested dict is encountered.
                    return hi(k)


        print(hi(l))  # hi ultimately returns None, so this prints "None"
    elif l:
        # Truthy top-level leaf counts as filled.
        example[ka]=1
    else:
        # Falsy top-level leaf counts as unfilled (False/0 also land here,
        # unlike the question's stated requirement).
        example[ka]=0
print(example)

输出:

{'key5': 0, 'key2-key4': 1, 'key1': 1, 'key2-key3': 1}