python - 嵌套数据的复杂排序

时间:2015-08-14 21:16:55

标签: python postgresql sorting sorted

我正在从 postgres(jsonb 类型)中检索数据,需要返回一个 OrderedDict,使其顺序对人和机器来说都是可预测的。有一些常用的键,[如果定义了 sort_order] 应按预定义的顺序决定同类型值的优先级。否则,排序应回退到按键的字典序排序。

总体目标是为复合字典提供一种可预测的、“合理”的表示。

基本算法是:

  1. dict 排在 list 之前。
  2. 既不是可迭代对象也不是映射的值,优先于可迭代对象和映射。
  3. 类型相同且键都不在 sort_order 中的值被视为相等,应按键的字典序排序。
  4. 如果 type(A[1]) == type(B[1]),且 A[0] 在 sort_order 中而 B[0] 不在 sort_order 中,则对象 A 优先于对象 B。
  5. 如果 all([type(A[1]) == type(B[1]), A[0] in sort_order, B[0] in sort_order]),则键在 sort_order 中的索引位置决定优先级。

我尝试了几种实现,但还没有想出任何我认为 pythonic / 优雅的写法。

    这是最新的化身

    # -*- coding: utf-8 -*-
    
    import json
    from collections import OrderedDict
    
    
    def dict_sort(obj, sort_order=None):
        """Build the string sort key for one ``(key, value)`` item of a dict.

        The returned string is meant to be compared with other results of
        this function (it is used as the ``key=`` of ``sorted``). It is made
        of up to three parts, most significant first:

        1. a one-digit bucket derived from the value type and from whether
           the key is listed in ``sort_order``:
           ``0``/``1`` plain values, ``2``/``3`` dicts, ``4``/``5`` lists
           (even digit: key is listed, odd digit: key is not listed);
        2. for listed keys only, the zero-padded position of the key in
           ``sort_order``;
        3. ``str(key)``, so keys in the same bucket sort lexicographically.

        :param obj: ``(key, value)`` pair, e.g. an element of ``dict.items()``
        :param sort_order: optional sequence of keys in priority order
        :rtype: str
        :returns: sort key for the pair
        """
        key, value = obj[0], obj[1]

        if isinstance(value, dict):
            bucket = 2
        elif isinstance(value, list):
            bucket = 4
        else:
            bucket = 0

        if sort_order is None or key not in sort_order:
            # Unlisted keys come right after the listed keys of the same
            # value type; appending the key gives the lexicographic fallback.
            return str(bucket + 1) + str(key)

        # Compare against None-ness implicitly via the membership test above:
        # position 0 is a valid (and the highest) priority.
        position = sort_order.index(key)
        # Pad so that string comparison agrees with numeric comparison
        # (otherwise position 10 would sort before position 2).
        width = len(str(len(sort_order) - 1))
        return str(bucket) + str(position).zfill(width) + str(key)
    
    
    def comp_sort(obj, sort_order=None):
        """Recursively order a composite object built from dicts and lists.

        * dict: returns an ``OrderedDict`` whose items are ordered by
          ``dict_sort``; dict and list values are processed recursively.
        * list: returns ``sorted(obj)`` when the elements can be compared
          with each other (nested containers are then left untouched);
          otherwise returns the elements in their original order with dict
          and list elements processed recursively.
        * anything else: returns an empty ``OrderedDict``.

        :param obj: object to order
        :param sort_order: optional sequence of keys in priority order,
            forwarded to ``dict_sort``
        :returns: the ordered copy described above
        """
        if isinstance(obj, dict):
            data = OrderedDict()
            ordered_items = sorted(
                obj.items(), key=lambda item: dict_sort(item, sort_order))
            for key, value in ordered_items:
                if isinstance(value, (dict, list)):
                    data[key] = comp_sort(value, sort_order)
                else:
                    data[key] = value
            return data

        if isinstance(obj, list):
            try:
                return sorted(obj)
            except TypeError:
                # Elements are not mutually orderable: keep the given order
                # and only normalise the nested containers. Only TypeError is
                # caught so unrelated failures (and KeyboardInterrupt) are no
                # longer silently swallowed.
                return [
                    comp_sort(value, sort_order)
                    if isinstance(value, (dict, list)) else value
                    for value in obj
                ]

        # Neither a dict nor a list: same result as before (empty mapping).
        return OrderedDict()
    
    # thx herk
    

    Here is a sample data set

1 个答案:

答案 0 :(得分:0)

经过一番琢磨,我终于找到了满足所有要求的解决方案。它有点慢,但确实可用。

反馈将不胜感激!

# -*- coding: utf-8 -*-

from __future__ import print_function
from functools import cmp_to_key
import collections
import urllib2
import json

def sort_it(obj=None, sort_order=None):
    """Return a copy of a composite python object with its mappings ordered.

    Every mapping found in ``obj`` (at any depth) is rebuilt as an
    ``OrderedDict``. Lists, tuples, sets and frozensets are rebuilt with the
    same type; their elements are processed recursively but are not
    reordered. Any other value is returned unchanged.

    Items of one mapping are ordered as follows:

    - scalar values (strings, int, float, complex) come before everything else
    - mappings come before list/tuple/set/frozenset values
    - items whose values are of the same (or similar) type:
       - if both keys are in ``sort_order``, the lower index comes first
       - if only one key is in ``sort_order``, it comes first
       - otherwise keys are compared case-insensitively, with the exact key
         as tie-breaker so the result does not depend on the input order

    Keys that are compared lexicographically must provide ``.lower()``
    (i.e. be strings, as produced by ``json.loads``).

    :param obj: Python object
    :param sort_order: optional custom sort order (sequence of hashable keys)
    :rtype: OrderedDict when ``obj`` is a mapping, otherwise see above
    :returns: Sorted composite object.
    :raises ValueError: when two values of one mapping cannot be ranked
        against each other (e.g. ``None`` next to a mapping or a list)
    """
    # Local compatibility shims so the function works on python 2 and 3.
    try:
        from collections.abc import Mapping
    except ImportError:  # python 2
        from collections import Mapping
    try:
        string_type = basestring
    except NameError:  # python 3
        string_type = str

    # NOTE: still implemented as a cmp-style function wrapped by cmp_to_key.
    text_types = (string_type, int, float, complex)
    iterable_types = (list, tuple, set, frozenset)

    # key -> first index in sort_order; built once instead of scanning
    # sort_order on every comparison.
    positions = {}
    for index, name in enumerate(sort_order or ()):
        positions.setdefault(name, index)

    def compare_keys(first, second):
        """Rank two mapping keys: sort_order position first, then text."""
        first_pos = positions.get(first)
        second_pos = positions.get(second)
        if first_pos is not None or second_pos is not None:
            if second_pos is None:
                return -1
            if first_pos is None:
                return 1
            return (first_pos > second_pos) - (first_pos < second_pos)

        # Neither key is prioritised: case-insensitive order, exact key as
        # tie-breaker so that e.g. 'A' and 'a' always rank the same way.
        left = (first.lower(), first)
        right = (second.lower(), second)
        return (left > right) - (left < right)

    def cmp_func(a, b):
        """cmp-style comparison of two ``(key, value)`` items.

        :param a: first ``(key, value)`` pair
        :param b: second ``(key, value)`` pair
        :rtype: int
        :return: negative if ``a`` comes first, positive if ``b`` comes
            first, zero if they rank equally
        """
        a_value, b_value = a[1], b[1]

        # text_types take precedence over non-text types
        a_text = isinstance(a_value, text_types)
        b_text = isinstance(b_value, text_types)
        if a_text != b_text:
            return -1 if a_text else 1

        # Mappings take precedence over iterable types
        a_map = isinstance(a_value, Mapping)
        b_map = isinstance(b_value, Mapping)
        a_iter = isinstance(a_value, iterable_types)
        b_iter = isinstance(b_value, iterable_types)
        if a_map and b_iter:
            return -1
        if b_map and a_iter:
            return 1

        # values of the same/similar type are ranked by their keys
        same_kind = (
            (a_text and b_text)
            or (a_iter and b_iter)
            or (a_map and b_map)
            or isinstance(a_value, type(b_value))
        )
        if not same_kind:
            raise ValueError('Unhandled condition for values %s, %s' % (a, b))
        return compare_keys(a[0], b[0])

    def sort_value(value):
        """Recursively apply the ordering to ``value``."""
        if isinstance(value, Mapping):
            ordered_items = sorted(value.items(), key=cmp_to_key(cmp_func))
            return collections.OrderedDict(
                (key, sort_value(item)) for key, item in ordered_items)
        if isinstance(value, iterable_types):
            return type(value)([sort_value(item) for item in value])
        return value

    return sort_value(obj)

# Keys listed here are placed first (in this order) among values of the same
# kind when passed as ``sort_order``.
sort_order = [
    'id',
    'rn',
    'dn',
    'vendor',
    'model',
    'serial',
    'name',
    'description',
    # A missing comma here used to merge the next two entries into the
    # single string 'traypresence'.
    'tray',
    'presence',
]

sample_data_uri = 'https://bit.ly/1jOpQF2'

# Download and decode the sample document once; both examples reuse it
# (sort_it builds new containers and does not modify its input).
response = urllib2.urlopen(sample_data_uri)
try:
    sample_data = json.loads(response.read())
finally:
    # Release the connection even if reading or decoding fails.
    response.close()


### EXAMPLE - Sans sort order

print(json.dumps(sort_it(sample_data), indent=4))

### EXAMPLE - with sort_order

print(json.dumps(sort_it(sample_data, sort_order=sort_order), indent=4))