我在python中有这样的字典:
{
'k1': 'AAB',
'k2': 'AAB',
'k3': 'ABA',
'k4': 'AAB',
'k5': 'A--',
'k6': 'AB-',
'k7': 'BB-',
'k8': 'B--',
'k9': 'B-B',
'k10': '--C',
}
其中 '-' 是通配符。我想通过对键进行分组来合并它们,得到:
{
'k1, k2, k4, [k5]' : 'AAB',
'k3, (k6), [k5]' : 'ABA',
'{k7, k8, k9}' : 'B--',
'{k10}' : '--C',
}
带有通配符的项,如果只与一个可能的组合匹配(例如 k6),应放在圆括号 () 中;如果与多个组合匹配(例如 k5),应放在方括号 [] 中。此外,当带有通配符的值与任何不含通配符的完整元素都不匹配时,它们应放在大括号 {} 中,并且该组的值应取通配符最多的那个值。
我已经设法使用正则表达式将字符串与通配符进行比较:
# v1 and v2 are the values to compare; '-' is the wildcard in either of them.
# Start from v1 with each of its own wildcards turned into fnmatch's
# single-character wildcard '?'.
v1_reg = v1.replace("-", "?")
# Wherever v2 holds a wildcard, the pattern must accept anything as well,
# so overwrite that position of the pattern with '?'.
for cont, s in enumerate(v2):
    if s == "-":
        v1_reg = v1_reg[:cont] + "?" + v1_reg[cont + 1:]
# NOTE(review): fnmatch() normalises case via os.path.normcase and treats
# '*' and '[' in v1 as pattern syntax; fnmatchcase() avoids the former.
if fnmatch(v2, v1_reg):
    pass  # THERE'S A MATCH
我仍然没有办法进行分组。
答案 0 :(得分:0)
以下代码会进行匹配并按需要分组;作为附带功能,它还会筛出没有匹配到任何内容的通配符项。我也为你实现了 TransUnit 之间的比较,用它可以判断两个 TransUnit 是相等,还是其中一个更通用。你可以自己运行一下,看看它是如何工作的:
from collections import defaultdict
class Char(object):
    """A literal pattern position: compatible only with the same character.

    Matching uses double dispatch: ``match`` hands this element's character
    to the other element's ``match_c``, so the other element decides whether
    a literal is acceptable to it.
    """

    def __init__(self, c):
        # c: the single character this position stands for.
        self._c = c

    def match(self, obj):
        """Return whether element ``obj`` accepts this literal character."""
        return obj.match_c(self._c)

    def match_c(self, cc):
        """Return True when the literal ``cc`` equals this character."""
        return self._c == cc

    def match_w(self):
        """Return True: a wildcard on the other side always fits a literal."""
        return True

    def __repr__(self):
        return 'Char({!r})'.format(self._c)
class WildCard(object):
    """A wildcard pattern position: compatible with any element.

    Exposes ``match``, ``match_c`` and ``match_w`` so it can take part in
    double-dispatch matching against another element; every answer it gives
    is True.
    """

    def match(self, obj):
        """Ask element ``obj`` whether it accepts a wildcard."""
        return obj.match_w()

    def match_c(self, cc):
        """Return True: a wildcard accepts any literal character ``cc``."""
        return True

    def match_w(self):
        """Return True: a wildcard accepts another wildcard."""
        return True

    def __repr__(self):
        return 'WildCard()'
class TransUnit(object):
    """One dictionary entry: a key plus its value parsed into pattern elements.

    Each character of the value becomes a ``WildCard`` when it is '-' and a
    ``Char`` otherwise. The unit also counts how many groups it was matched
    into, which decides how it is rendered by ``__str__``.
    """

    def __init__(self, key, s):
        self._val = s
        self._arr = [WildCard() if c == '-' else Char(c) for c in s]
        self._key = key
        self._matches = 0
        self._has_wildcard = any(isinstance(x, WildCard) for x in self._arr)

    @property
    def has_wildcard(self):
        """True when at least one position of the value is a wildcard."""
        return self._has_wildcard

    @property
    def val(self):
        """The raw value string this unit was built from."""
        return self._val

    def found_match(self):
        """Record that this unit was matched into one more group."""
        self._matches += 1

    def match(self, obj):
        """Return whether this unit and ``obj`` are compatible.

        ``obj`` may be another ``TransUnit`` or a plain ``str``; a string is
        wrapped in a temporary, key-less ``TransUnit`` first.
        """
        if isinstance(obj, str):
            obj = TransUnit(None, obj)
        return obj.match_arr(self._arr)

    def match_arr(self, arr):
        """Return True when ``arr`` has the same length as this unit's
        elements and every pair of elements at the same position matches."""
        if len(self._arr) != len(arr):
            return False
        return all(mine.match(theirs) for mine, theirs in zip(self._arr, arr))

    def compare(self, obj):
        """Compare generality with ``obj`` (another ``TransUnit``).

        Returns 1 when this unit has the wildcard at the first position where
        exactly one side has one, -1 when ``obj`` has it, and 0 when both
        have wildcards in the same positions.

        Raises:
            ValueError: if the two units do not match each other.
        """
        return -obj.compare_arr(self._arr)

    def compare_arr(self, arr):
        """Compare this unit's elements against ``arr`` position by position.

        Only the first position where exactly one side is a wildcard decides
        the result: -1 if ``arr`` has the wildcard there, 1 if this unit has
        it, 0 if no such position exists.

        Raises:
            ValueError: if ``arr`` does not match this unit.
        """
        if not self.match_arr(arr):
            # ValueError is still an Exception, so existing handlers keep working.
            raise ValueError('cannot compare patterns that do not match')
        for mine, theirs in zip(self._arr, arr):
            mine_wild = isinstance(mine, WildCard)
            theirs_wild = isinstance(theirs, WildCard)
            if theirs_wild and not mine_wild:
                return -1
            if mine_wild and not theirs_wild:
                return 1
        return 0

    def __str__(self):
        """Render the key: bare without wildcards, '(key)' with wildcards and
        at most one recorded match, '[key]' with more than one."""
        # format() rather than '+' so a key of None (used for the temporary
        # units built in match()) renders instead of raising TypeError.
        if not self._has_wildcard:
            return '{}'.format(self._key)
        if self._matches <= 1:
            return '({})'.format(self._key)
        return '[{}]'.format(self._key)

    __repr__ = __str__
def group(dct):
    """Group the entries of ``dct`` by value, attaching wildcard entries.

    Entries whose value has no wildcard are collected under that value.
    Each wildcard entry is then appended to every collected group whose
    value it matches, recording one match per group.

    Returns:
        A ``(grouper, notmatched)`` tuple: ``grouper`` maps each
        wildcard-free value to its list of ``TransUnit`` objects, and
        ``notmatched`` lists the wildcard units that matched no group.
    """
    grouper = defaultdict(list)
    pending = []
    for key, value in dct.items():
        unit = TransUnit(key, value)
        if unit.has_wildcard:
            pending.append(unit)
        else:
            grouper[unit.val].append(unit)

    notmatched = []
    for unit in pending:
        # Matching looks only at the group's value, so collecting the hits
        # first and appending afterwards gives the same result.
        hits = [members for literal, members in grouper.items()
                if unit.match(literal)]
        for members in hits:
            unit.found_match()
            members.append(unit)
        if not hits:
            notmatched.append(unit)
    return (grouper, notmatched)
试验:
>>> group({
'k1': 'AAB',
'k2': 'AAB',
'k3': 'ABA',
'k4': 'AAB',
'k5': 'A--',
'k6': 'AB-',
'k7': 'BB-',
'k8': 'B--',
'k9': 'B-B',
})
(defaultdict(<type 'list'>, {'ABA': [k3, (k6), [k5]], 'AAB': [k2, k1, k4, [k5]]}), [(k7), (k9), (k8)])
答案 1 :(得分:0)
编辑: 我已将此修改为您的新规范。
也许有点矫枉过正了。但这很有效。 (请原谅可怕的变量名) 它不是最有效的方法,但它遵循逻辑步骤(至少对我来说)希望这将满足您的需求,但您应该将其作为指导并对其进行改进以满足您的特定需求。
from collections import defaultdict
from operator import itemgetter
from pprint import pprint
from re import match
## Class to help display the results as you want
class Organizer(object):
    """Collects the keys of one group in four buckets and renders them.

    Buckets, in rendering order: ``normal`` (no decoration), ``parens``
    ('(...)'), ``brackets`` ('[...]') and ``fancy`` ('{...}').
    """

    def __getitem__(self, key=None):
        """Return a fresh, empty Organizer for any subscript.

        ``key`` is accepted (and ignored) so that ``organizer[x]`` works;
        without it, subscripting raised TypeError.
        """
        return Organizer()

    def __init__(self):
        self.normal = []
        self.parens = []
        self.brackets = []
        self.fancy = []

    def __str__(self):
        """Render the non-empty buckets, each sorted, joined by ', '."""
        sections = (
            (self.normal, '', ''),
            (self.parens, '(', ')'),
            (self.brackets, '[', ']'),
            (self.fancy, '{', '}'),
        )
        rendered = ''.join(
            self._frmt(group, a, z) for group, a, z in sections if group
        )
        # Drop exactly the one leading separator added by _frmt. str.strip
        # would also eat commas or spaces that belong to the keys themselves.
        return rendered[len(', '):]

    def _frmt(self, group, a, z):
        """Return ', ' followed by the sorted keys of ``group`` wrapped in
        the opening text ``a`` and closing text ``z``."""
        return ', {}{}{}'.format(a, ', '.join(sorted(group)), z)
## Function that does what you want.
def match_to_wildcards(key_value_pairs):
    """Group keys by value, folding wildcard values into the groups they fit.

    A '-' in a value is a wildcard for exactly one character.

    * Keys whose value has no wildcard are grouped under that value.
    * A wildcard key that fits exactly one such group is added to it in
      parentheses; one that fits several is added to each in brackets.
    * Wildcard keys that fit no group are rendered in braces, filed under a
      wildcard value chosen from the leftovers (see the comments below).

    Args:
        key_value_pairs: dict mapping each key to its value string.

    Returns:
        dict mapping the rendered key list of each group (a string) to the
        value that group stands for.
    """

    def covers(pattern, candidate):
        # Position-wise comparison over the whole string. '-' in the pattern
        # accepts any character of the candidate (including a literal '-');
        # everything else must be equal. Unlike re.match on a '.'-substituted
        # pattern, this requires equal length and never interprets regex
        # metacharacters that happen to appear in a value.
        return len(pattern) == len(candidate) and all(
            p == '-' or p == c for p, c in zip(pattern, candidate)
        )

    values_as_keys, partial = defaultdict(Organizer), {}

    # Prepare the initial groups.
    for key, value in key_value_pairs.items():
        if '-' in value:
            partial[key] = value
        else:
            values_as_keys[value].normal.append(key)

    # Attach each wildcard key to every wildcard-free group it fits.
    partial_matches = {
        k: [f for f in values_as_keys if covers(v, f)]
        for k, v in partial.items()
    }
    for key, values in partial_matches.items():
        if not values:
            continue
        for value in values:
            if len(values) > 1:
                values_as_keys[value].brackets.append(key)
            else:
                values_as_keys[value].parens.append(key)
        # Placed, so it is no longer a leftover.
        partial.pop(key, None)

    # For every leftover wildcard value, list the leftover values it covers.
    combined_unmatched = {
        u: [v for v in partial.values() if covers(u, v)]
        for u in partial.values()
    }
    # A leftover that covers only one leftover value is a candidate for being
    # filed under a more general leftover: collect every leftover pattern
    # covering it, together with that pattern's wildcard count.
    # NOTE(review): the lists above keep duplicates, so two keys sharing the
    # same wildcard value give len(v) == 2 and are filed under their own
    # value - confirm this is intended.
    need_to_be_rearranged = {
        k: [(p, p.count('-')) for p, j in combined_unmatched.items() if k in j]
        for k, v in combined_unmatched.items() if len(v) == 1
    }

    # File each leftover under the covering pattern with the most wildcards,
    # or under its own value when it is not a candidate.
    for key, value in partial.items():
        if value in need_to_be_rearranged:
            value_key = max(need_to_be_rearranged[value], key=itemgetter(1))[0]
            values_as_keys[value_key].fancy.append(key)
        else:
            values_as_keys[value].fancy.append(key)

    # Return the results, keyed by each group's rendered key list.
    return {str(v): k for k, v in values_as_keys.items()}
现在使用您的数据运行代码。
## Test
# Sample input from the question; '-' marks a wildcard position in a value.
data_set = dict([
    ('k1', 'AAB'),
    ('k2', 'AAB'),
    ('k3', 'ABA'),
    ('k4', 'AAB'),
    ('k5', 'A--'),
    ('k6', 'AB-'),
    ('k7', 'BB-'),
    ('k8', 'B--'),
    ('k9', 'B-B'),
    ('k10', '--C'),
])
# Group the sample and pretty-print the resulting mapping.
pprint(match_to_wildcards(data_set))
# {'k1, k2, k4, [k5]': 'AAB',
# 'k3, (k6), [k5]': 'ABA',
# '{k10}': '--C',
# '{k7, k8, k9}': 'B--'}