我有一个在Python中接收嵌套字典的进程:
示例嵌套字典架构(伪代码)
key1: value1,
key2: dict(
key3: value2,
key4: value3,
),
key5: list(value4,value5) # any value is fine, just not empty or null
示例嵌套字典数据(伪代码)
key1: 'value',
key2: dict(
key3: '',
key4: 12345,
),
key5: list()
我想迭代/扫描这个dict并检查每个键是否有值(不是Null或空白 - false / 0都可以)。我需要扫描一堆结构完全相同的dicts,以获得这组dicts的整体"填充率"。该流程每次运行时都会看到不同格式的dicts集,因此需要自动生成填充率报告:
上面的单个嵌套示例的示例填充率(理想情况下是平坦的字典):
key1: 1
key2: 1
key2-key3: 0
key2-key4: 1
key5: 0
例如,如果我们扫描了十个相同结构的dicts,我们可能会看到像这样的"填充率":
key1: 5
key2: 6
key2-key3: 6
key2-key4: 4
key5: 3
问题
扫描不同结构的dicts以获得填充率的最pythonic方法是什么?如果我必须这样做数百万次,是否有更有效的方法?
创建平面字典以存储计数以及如何更新它的最pythonic方法是什么?
答案 0 :(得分:1)
这是我的看法:
根据填充率扫描不同结构的dicts的最pythonic方法是什么?
递归。特别是,我将walked子树的结果返回给调用者。调用者负责将多个子树合并到其树的结果中。
如果我必须这样做数百万次,是否有更有效的方法?
可能。尝试一个解决方案,看看它是否是A)正确和B)足够快。如果两者兼而有之,请不要费心寻找效率。
创建平面字典以存储计数以及如何更新它的最pythonic方法是什么?
使用Python附带的一个库。在这种情况下,#include <stdio.h>
#include <omp.h>
#include <math.h>
#include "immintrin.h"
#include <assert.h>
/* F_E_Q(X, Y, N): approximate float equality.
 * Evaluates to true when the difference between X*10^N and Y*10^N rounds to
 * zero, i.e. when X and Y differ by less than about 0.5 * 10^-N. */
#define F_E_Q(X,Y,N) (round((X) * pow(10, N)-(Y) * pow(10, N)) == 0)
/* Forward declarations of the two matrix-multiply kernels defined further
 * down: the scalar baseline and the SIMD version that is checked against it. */
void reference(float* a, float* b, float* c, int n, int nPadded);
void intrinsics(float* a, float* b, float* c, int n, int nPadded);
/* Runs reference() and intrinsics() on the same random input, times each with
 * omp_get_wtime(), and asserts that both produce matching results.
 * Always returns NULL; a mismatch or failed allocation trips an assert.
 * NOTE(review): debug() and debug_arr() are not defined in the visible part of
 * this file; they are presumably logging macros from a project header. */
char *test(){
int n=4800;
/* The row stride equals n here, so no padding is actually applied. */
int nPadded = n;
/* The __MIC__ kernel steps 16 floats at a time with no remainder loop. */
assert(n%16 == 0);
/* Four n x nPadded float matrices, each allocated on a 64-byte boundary. */
float* a = (float*) _mm_malloc(sizeof(float)*n*nPadded, 64);
float* b = (float*) _mm_malloc(sizeof(float)*n*nPadded, 64);
float* cRef = (float*) _mm_malloc(sizeof(float)*n*nPadded, 64);
float* c = (float*) _mm_malloc(sizeof(float)*n*nPadded, 64);
/* NOTE(review): these allocation checks disappear when NDEBUG is defined. */
assert(a != NULL);
assert(b != NULL);
assert(cRef != NULL);
assert(c != NULL);
/* Fill the inputs with pseudo-random values and zero both outputs, because
 * the kernels accumulate into c rather than overwrite it.
 * NOTE(review): the origin of the divisor 1804289408.0 is not shown here, and
 * no srand() call is visible in this function. */
for(int i=0, max = n*nPadded; i<max; i++){
a[i] = (int) rand() / 1804289408.0;
b[i] = (int) rand() / 1804289408.0;
cRef[i] = 0.0;
c[i] = 0.0;
}
debug_arr("a", "%f", a, 0, 9, 1);
debug_arr("b", "%f", b, 0, 9, 1);
debug_arr("cRef", "%f", cRef, 0, 9, 1);
debug_arr("c", "%f", c, 0, 9, 1);
/* Time the scalar baseline, which writes into cRef. */
double t1 = omp_get_wtime();
reference(a, b, cRef, n, nPadded);
double t2 = omp_get_wtime();
debug("reference calc time: %f", t2-t1);
/* Time the SIMD kernel, which writes into c. */
t1 = omp_get_wtime();
intrinsics(a, b, c, n, nPadded);
t2 = omp_get_wtime();
debug("Intrinsics calc time: %f", t2-t1);
debug_arr("cRef", "%f", cRef, 0, 9, 1);
debug_arr("c", "%f", c, 0, 9, 1);
/* Element-wise comparison of the two results using F_E_Q with N = 2. */
for(int i=0, max = n*nPadded; i<max; i++){
assert(F_E_Q(cRef[i], c[i], 2));
}
/* Memory obtained from _mm_malloc must be released with _mm_free. */
_mm_free(a);
_mm_free(b);
_mm_free(cRef);
_mm_free(c);
return NULL;
}
/* Scalar baseline: accumulates the product of the n x n matrices a and b
 * into c (c += a * b). All three matrices are row-major with a row stride
 * of nPadded floats. c is not cleared first, so callers must zero it when
 * they want a plain product. */
void reference(float* a, float* b, float* c, int n, int nPadded){
    for (int row = 0; row < n; ++row) {
        float *cRow = c + row * nPadded;
        float *aRow = a + row * nPadded;
        for (int k = 0; k < n; ++k) {
            float *bRow = b + k * nPadded;
            /* aRow[k] is re-read on every step, exactly as the indexed
             * form does, so results match even if the buffers alias. */
            for (int col = 0; col < n; ++col) {
                cRow[col] = cRow[col] + aRow[k] * bRow[col];
            }
        }
    }
}
/* Two builds of the same SIMD kernel, chosen at compile time. Both accumulate
 * c += a * b for n x n row-major matrices with a row stride of nPadded floats,
 * using the same i-k-j loop order as reference(). */
#if __MIC__
/* __MIC__ build: processes 16 floats of a row per step.
 * n must be a multiple of 16 because there is no remainder loop.
 * _mm512_load_ps / _mm512_store_ps need 64-byte aligned addresses, so the
 * b and c buffers and the row stride must keep every b + k*nPadded + j and
 * c + i*nPadded + j aligned. */
void intrinsics(float* a, float* b, float* c, int n, int nPadded){
/* Rows of c are divided among OpenMP threads; iteration i stores only to
 * row i of c (assuming nPadded >= n). */
#pragma omp parallel for
for(int i = 0; i < n; i++ )
for(int k = 0; k < n; k++ )
for(int j = 0; j < n; j+=16 ){
/* Load the single element a[i][k], with no up-conversion, replicated 1x16. */
__m512 aPart = _mm512_extload_ps(a + i*nPadded+k, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, _MM_HINT_NONE);
__m512 bPart = _mm512_load_ps(b + k*nPadded+j);
__m512 cPart = _mm512_load_ps(c + i*nPadded+j);
/* c[i][j..j+15] += a[i][k] * b[k][j..j+15] */
cPart = _mm512_add_ps(cPart, _mm512_mul_ps(aPart, bPart));
_mm512_store_ps(c + i*nPadded+j, cPart);
}
}
#else
/* Fallback build: processes 4 floats of a row per step.
 * n must be a multiple of 4 because there is no remainder loop.
 * _mm_load_ps / _mm_store_ps need 16-byte aligned addresses. */
void intrinsics(float* a, float* b, float* c, int n, int nPadded){
/* Rows of c are divided among OpenMP threads; iteration i stores only to
 * row i of c (assuming nPadded >= n). */
#pragma omp parallel for
for(int i = 0; i < n; i++ )
for(int k = 0; k < n; k++ )
for(int j = 0; j < n; j+=4 ){
/* Load the single element a[i][k] and replicate it into all four lanes. */
__m128 aPart = _mm_load_ps1(a + i*nPadded+k);
__m128 bPart = _mm_load_ps(b + k*nPadded+j);
__m128 cPart = _mm_load_ps(c + i*nPadded+j);
/* c[i][j..j+3] += a[i][k] * b[k][j..j+3] */
cPart = _mm_add_ps(cPart, _mm_mul_ps(aPart, bPart));
_mm_store_ps(c + i*nPadded+j, cPart);
}
}
#endif
即 collections.Counter(),并通过调用其 .update() 函数来更新它。
结果:
from collections import Counter
from pprint import pprint
# Two sample inputs with different shapes: the second one uses 'key7' where
# the first uses 'key2', and its 'key5' list is not empty.
example1_dict = {
    'key1': 'value',
    'key2': {'key3': '', 'key4': 12345},
    'key5': [],
}
example2_dict = {
    'key1': 'value',
    'key7': {'key3': '', 'key4': 12345},
    'key5': [1],
}
def get_fill_rate(d, path=()):
    """Report, for one nested dict, which keys actually hold a value.

    Args:
        d: The (possibly nested) dict to scan.
        path: Tuple of parent keys leading to ``d``. Used by the recursion;
            top-level callers normally leave it at the default.

    Returns:
        A ``Counter`` keyed by the tuple path of every key that was seen.
        A key maps to 1 when its value is filled and to 0 when it is blank,
        so blank keys still appear in the report.
    """
    result = Counter()
    for k, v in d.items():
        key = path + (k,)
        if isinstance(v, dict):
            # An empty nested dict is blank, just like an empty list or
            # string; a non-empty one counts and is scanned recursively.
            result[key] += 1 if v else 0
            result.update(get_fill_rate(v, key))
        else:
            # False and 0 are legitimate values; any other falsy value
            # (None, '', [], ...) is blank. Adding 0 still creates the key.
            result[key] += 1 if (v in (False, 0) or v) else 0
    return result
def get_fill_rates(l):
    """Add up the counts that get_fill_rate reports for each dict in ``l``.

    Args:
        l: Iterable of dicts to scan.

    Returns:
        A plain dict holding, for every key reported by get_fill_rate, the
        sum of its counts over all scanned dicts.
    """
    totals = Counter()
    for single in l:
        per_dict = get_fill_rate(single)
        # Counter.update adds counts instead of replacing them.
        totals.update(per_dict)
    return dict(totals)
result = get_fill_rates([example1_dict, example2_dict])

# Raw result: keys are tuples of key names.
pprint(result)

# Formatted result: one "parent-child: count" line per key, in sorted order.
print('\n'.join(
    '{}: {}'.format('-'.join(key_path), count)
    for key_path, count in sorted(result.items())))
答案 1 :(得分:0)
好的,我想我解决了。我做了一些非常小的测试,但我觉得这很有效:
def scan_dict(d, is_filled=None):
    """Count the keys of a nested dict under flat, dash-joined names.

    Args:
        d: The (possibly nested) dict to scan.
        is_filled: Optional predicate called with each value. When given, a
            key counts 1 if the predicate is truthy for its value and 0
            otherwise. When omitted, every key counts 1 whatever its value
            is, which is the original behaviour.

    Returns:
        A dict mapping ``str(key)`` for top-level keys, and
        ``"parent-child"`` for nested keys, to its count.
    """
    counts = {}
    for k, v in d.items():
        key = str(k)
        if isinstance(v, dict):
            # Fold the child's counts in under "parent-child" names.
            for subkey, subcount in scan_dict(v, is_filled).items():
                new_key = key + "-" + str(subkey)
                counts[new_key] = counts.get(new_key, 0) + subcount
        filled = is_filled is None or is_filled(v)
        # Always record the key, even with 0, so blanks show in the report.
        counts[key] = counts.get(key, 0) + (1 if filled else 0)
    return counts
def scan_all_dicts(ds):
    """Sum the counts returned by scan_dict over every dict in ``ds``.

    Args:
        ds: Iterable of dicts to scan.

    Returns:
        A dict mapping each key name produced by scan_dict to the total of
        its counts across all the dicts.
    """
    total_counts = {}
    for single in ds:
        for name, amount in scan_dict(single).items():
            total_counts[name] = total_counts.get(name, 0) + amount
    return total_counts
基本上,有一个递归函数可以扫描每个字典,并统计其中的所有键以及它找到的任何子字典中的键。
“driver”是第二个函数,它接受一个可迭代的(例如列表)dicts并在第一个函数中运行它们,然后返回所有值的扁平列表。
我没有检查这些值以确保它们“不是空白”;我会把它留给你。
答案 2 :(得分:0)
递归是解决这个问题的最pythonic的方式;但是,此解决方案利用装饰器更新全局字典以存储整体填充率。使用collections.defaultdict时,每次调用get_occurences都可以多次更新final_dict:
from collections import defaultdict
import re
# Running totals across every structure scanned so far, keyed by flat name.
final_dict = defaultdict(int)


def fill(f):
    """Decorator that folds the counts produced by ``f`` into ``final_dict``.

    The wrapped function is called for its side effect only: the wrapper
    returns None.
    """
    def update_count(structure, last=None):
        # ``last`` is optional so top-level callers can pass the structure
        # alone, and it is forwarded so nested keys keep their parent prefix.
        data = f(structure, last=last)

        def update_final(d):
            for a, b in d.items():
                if isinstance(b, dict):
                    final_dict[a] += int(bool(update_final(b)))
                else:
                    final_dict[a] += int(bool(b))

        update_final(data)
    return update_count


@fill
def get_occurences(d, last=None):
    """Build ``{flat_key: 0 or 1}`` for one level of ``d``.

    Args:
        d: The dict to scan.
        last: Dash-joined path of the parent keys, or None at the top level.

    Nested dicts are handled by calling the decorated ``get_occurences``
    again, which adds the nested counts to ``final_dict`` directly.
    """
    counts = {}
    for a, b in d.items():
        key = "{}-{}".format(last, a) if last else a
        if isinstance(b, dict):
            # Pass the full path down so deeper levels keep every ancestor.
            # NOTE(review): the decorated call returns None, so a key whose
            # value is a dict is itself recorded with 0. That matches the
            # output published with this answer; confirm it is intended.
            counts[key] = get_occurences(b, key)
        else:
            counts[key] = int(bool(b))
    return counts
structures = [
    {'key1': 'value', 'key2': {'key3': '', 'key4': 12345}, 'key5': []},
    {'key1': 18, 'key2': 'value1', 'key3': ['James', 'Bob', 'Bill']},
    {'key1': 'value2', 'key2': {'key3': '233', 'key4': 12345}, 'key5': 100},
]
for structure in structures:
    get_occurences(structure)

# Order the report by the number at the end of each key, listing a flat key
# before a nested "parent-child" key that ends in the same number.
# The sort key takes the whole (key, count) pair because tuple parameters in
# a lambda are not valid in Python 3. A key that does not end in digits
# raises IndexError here.
for i in sorted(
        final_dict.items(),
        key=lambda item: (int(re.findall(r'\d+$', item[0])[0]),
                          bool(re.findall(r'\w+-\w+', item[0])))):
    print("{}: {}".format(*i))
输出:
{'key2-key3': 1, 'key2-key4': 2, 'key1': 3, 'key2': 1, 'key5': 1, 'key3': 1}
输出:
key1: 3
key2: 1
key3: 1
key2-key3: 1
key2-key4: 2
key5: 1
答案 3 :(得分:0)
您可以尝试这样的事情:
example1_dict = {
    'key1': 'value',
    'key2': {
        'key3': '',
        'key4': 12345,
    },
    'key5': list()
}

# Flat report: "parent-child" key name -> number of times it held a value.
example = {}


def hi(fg, track=''):
    """Record the fill count of every key of ``fg`` in ``example``.

    Args:
        fg: The (possibly nested) dict to scan.
        track: Dash-joined path of the parent keys; empty at the top level.

    Each key is added to ``example`` under its dash-joined path with 1 when
    its value is filled and 0 when it is blank. Nested dicts are recorded
    themselves and then scanned, and every sibling key is visited.
    """
    for i, k in fg.items():
        path = "{}-{}".format(track, i) if track else i
        # Numbers and booleans (0 and False included) always count as a
        # value; anything else counts only when it is non-empty.
        filled = isinstance(k, (bool, int, float)) or bool(k)
        example[path] = example.get(path, 0) + int(filled)
        if isinstance(k, dict):
            hi(k, path)


hi(example1_dict)
# For example1_dict this prints:
# {'key1': 1, 'key2': 1, 'key2-key3': 0, 'key2-key4': 1, 'key5': 0}
print(example)
输出:
{'key5': 0, 'key2-key4': 1, 'key1': 1, 'key2-key3': 1}