为了说明我的意思,这是一个例子
messages = [
('Ricky', 'Steve', 'SMS'),
('Steve', 'Karl', 'SMS'),
('Karl', 'Nora', 'Email')
]
我想将此列表和组的定义转换为整数列表和查找字典,以便组中的每个元素都获得唯一的ID。该id应映射到查找表中的元素,如此
messages_int, lookup_table = create_lookup_list(
messages, ('person', 'person', 'medium'))
print messages_int
[ (0, 1, 0),
(1, 2, 0),
(2, 3, 1) ]
print lookup_table
{ 'person': ['Ricky', 'Steve', 'Karl', 'Nora'],
'medium': ['SMS', 'Email']
}
我想知道是否有一个优雅和pythonic解决这个问题。
我也比create_lookup_list
等更好的术语
答案 0 :(得分:3)
defaultdict
结合itertools.count().next
方法是将标识符分配给唯一项的好方法。以下是如何在您的案例中应用此内容的示例:
from itertools import count
from collections import defaultdict
def create_lookup_list(data, domains):
domain_keys = defaultdict(lambda:defaultdict(count().next))
out = []
for row in data:
out.append(tuple(domain_keys[dom][val] for val, dom in zip(row, domains)))
lookup_table = dict((k, sorted(d, key=d.get)) for k, d in domain_keys.items())
return out, lookup_table
修改:请注意,{3}中的count().next
变为count().__next__
或lambda: next(count())
。
答案 1 :(得分:2)
我的长度和复杂程度相同:
import collections
def create_lookup_list(messages, labels):
# Collect all the values
lookup = collections.defaultdict(set)
for msg in messages:
for l, v in zip(labels, msg):
lookup[l].add(v)
# Make the value sets lists
for k, v in lookup.items():
lookup[k] = list(v)
# Make the lookup_list
lookup_list = []
for msg in messages:
lookup_list.append([lookup[l].index(v) for l, v in zip(labels, msg)])
return lookup_list, lookup
答案 2 :(得分:2)
在Otto的回答中(或其他任何人使用string-> id dicts),我会替换(如果过度追求速度是你的话):
# create the lookup table
lookup_dict = {}
for group in indices:
lookup_dict[group] = sorted(indices[group].keys(),
lambda e1, e2: indices[group][e1]-indices[group][e2])
通过
# k2i must map keys to consecutive ints [0,len(k2i)-1)
def inverse_indices(k2i):
inv=[0]*len(k2i)
for k,i in k2i.iteritems():
inv[i]=k
return inv
lookup_table = dict((g,inverse_indices(gi)) for g,gi in indices.iteritems())
这更好,因为直接分配到逆数组中的每个项目比分类更快。
答案 3 :(得分:1)
这是我自己的解决方案 - 我怀疑它是最好的
def create_lookup_list(input_list, groups):
# use a dictionary for the indices so that the index lookup
# is fast (not necessarily a requirement)
indices = dict((group, {}) for group in groups)
output = []
# assign indices by iterating through the list
for row in input_list:
newrow = []
for group, element in zip(groups, row):
if element in indices[group]:
index = indices[group][element]
else:
index = indices[group][element] = len(indices[group])
newrow.append(index)
output.append(newrow)
# create the lookup table
lookup_dict = {}
for group in indices:
lookup_dict[group] = sorted(indices[group].keys(),
lambda e1, e2: indices[group][e1]-indices[group][e2])
return output, lookup_dict
答案 4 :(得分:1)
这更简单,更直接。
from collections import defaultdict
def create_lookup_list( messages, schema ):
def mapped_rows( messages ):
for row in messages:
newRow= []
for col, value in zip(schema,row):
if value not in lookups[col]:
lookups[col].append(value)
code= lookups[col].index(value)
newRow.append(code)
yield newRow
lookups = defaultdict(list)
return list( mapped_rows(messages) ), dict(lookups)
如果查找是正确的词典,而不是列表,则可以进一步简化 使“查找表”具有以下结构
{ 'person': {'Ricky':0, 'Steve':1, 'Karl':2, 'Nora':3},
'medium': {'SMS':0, 'Email':1}
}
它可以进一步降低复杂性。
您可以将查找的此工作副本转换为反向,如下所示:
>>> lookups = { 'person': {'Ricky':0, 'Steve':1, 'Karl':2, 'Nora':3},
'medium': {'SMS':0, 'Email':1}
}
>>> dict( ( d, dict( (v,k) for k,v in lookups[d].items() ) ) for d in lookups )
{'person': {0: 'Ricky', 1: 'Steve', 2: 'Karl', 3: 'Nora'}, 'medium': {0: 'SMS', 1: 'Email'}}
答案 5 :(得分:0)
这是我的解决方案,它并不是更好 - 它只是不同:)
def create_lookup_list(data, keys):
encoded = []
table = dict([(key, []) for key in keys])
for record in data:
msg_int = []
for key, value in zip(keys, record):
if value not in table[key]:
table[key].append(value)
msg_int.append(table[key].index(value))
encoded.append(tuple(msg_int))
return encoded, table
答案 6 :(得分:0)
这是我的,内部函数允许我将index-tuple写为生成器。
def create_lookup_list( data, format):
table = {}
indices = []
def get_index( item, form ):
row = table.setdefault( form, [] )
try:
return row.index( item )
except ValueError:
n = len( row )
row.append( item )
return n
for row in data:
indices.append( tuple( get_index( item, form ) for item, form in zip( row, format ) ))
return table, indices