给定一个字典:
# Example data: the keys are integers stored as strings (one is negative),
# the values are arbitrary payloads.
sample = {
'123': 'Foo',
'456': 'Bar',
'789': 'Hello',
'-111': 'World'
}
从字典中获取最接近且不大于给定键(即小于或等于)的元素,最高效的方式(方法和/或数据结构)是什么?
注意:
1.即使键是字符串,比较也应按数值进行
2.键可以是负数。
示例:
# Expected results of the requested function for the ``sample`` dictionary:
# the value stored under the largest key that is numerically <= the query.
get_nearest_less_element(sample, '456') # returns 'Bar'
get_nearest_less_element(sample, '235') # returns 'Foo'
get_nearest_less_element(sample, '455') # returns 'Foo'
get_nearest_less_element(sample, '999') # returns 'Hello'
get_nearest_less_element(sample, '0') # returns 'World'
get_nearest_less_element(sample, '-110') # returns 'World'
get_nearest_less_element(sample, '-999') # should return an error since it is beyond the lower bound
其他问题:
给定相同的数据集,sorted OrderedDict
或List of Tuples
或任何其他python数据结构是更好的方法吗?
答案 0 :(得分:5)
def get_nearest_less_element(d, k):
    """Return the value whose key is the largest one numerically <= ``k``.

    Args:
        d: Mapping whose keys are strings that ``int()`` can parse.
        k: Query key; a string (or number) that ``int()`` can parse.

    Returns:
        ``d[key]`` for the key with the greatest integer value not above
        ``int(k)``.

    Raises:
        ValueError: If every key in ``d`` is numerically greater than ``k``
            (including when ``d`` is empty), or if a key cannot be parsed.
    """
    k = int(k)
    # Keep the original key strings: converting to int and back to str would
    # miss keys that are not in canonical form (e.g. '007' or '+5').
    candidates = [key for key in d if int(key) <= k]
    if not candidates:
        raise ValueError("no key is less than or equal to %d" % k)
    return d[max(candidates, key=int)]
编辑以使用@Paul Hankin的代码更新,但使用<=
我不确定它是否需要分支。将所有键转换为数字,找到小于或等于k的键,得到最大值 - 如果k在那里你得到它,否则你将获得下一个最大值 - 转换回字符串并在字典中查找。
我不知道它是否是最有效的想法;因为你获得的字典是无序的,你必须遍历每个元素,因为它们中的任何一个都可能是下一个元素,并且由于你需要数字比较,你必须将它们全部转换为整数。在我看来,任何其他结构都需要更多的初始化成本,因为您必须首先检查每个项目以将其放入您的结构中。
但这取决于你的用例 - 如果 k 很可能在字典中,那么把我的代码改成带 `if k in d: return d[k] else: ...` 分支的形式是有意义的,因为在这种情况下不必执行生成器表达式,速度会更快。如果它很可能不在字典中,这样做就没有多大帮助。
下面是一个(未经测试的)伪代码版本,它先对键进行排序:第一次调用会慢一些,但之后的查询可能更快:
# Cache of the dictionary's keys as sorted integers, kept between calls.
# NOTE: the cache is filled on first use and never refreshed, so it must be
# invalidated (``del sorted_keys[:]``) whenever a different or modified
# dictionary is queried.
sorted_keys = []


def get_nearest_less_element(d, k):
    """Return the value whose key is the largest one numerically <= ``k``.

    An exact string match is answered directly from ``d``.  Otherwise the
    keys are sorted once (cached in the module-level ``sorted_keys`` list)
    and the answer is found with a binary search.

    Args:
        d: Mapping whose keys are integers written as canonical decimal
            strings, i.e. ``str(int(key)) == key`` (assumed, because the
            cache stores integers and converts back with ``str``).
        k: Query key as a string that ``int()`` can parse.

    Returns:
        The value stored under the greatest key not above ``int(k)``.

    Raises:
        ValueError: If ``k`` is below the smallest key or ``d`` is empty.
    """
    from bisect import bisect_right

    if k in d:  # quick return if the key is in the dict
        return d[k]

    # Costly sort of the keys, done only while the cache is empty.  The list
    # is filled in place so the module-level name is not shadowed by a local.
    if not sorted_keys:
        sorted_keys[:] = sorted(int(key) for key in d)

    # bisect_right returns the number of cached keys that are <= int(k).
    count = bisect_right(sorted_keys, int(k))
    if count == 0:
        raise ValueError("no key is less than or equal to %s" % k)
    return d[str(sorted_keys[count - 1])]
答案 1 :(得分:5)
如果键存在,下面的函数直接返回对应的值;否则,它会在所有小于输入键的键中找出最大的那个。
def get_nearest_less_element(sample, key):
    """Return the value whose key is the largest one numerically <= ``key``.

    Args:
        sample: Mapping whose keys are strings that ``int()`` can parse.
        key: Query key as a string that ``int()`` can parse.

    Returns:
        ``sample[key]`` when the exact key exists, otherwise the value of
        the numerically greatest key that does not exceed ``int(key)``.

    Raises:
        ValueError: If no key is numerically <= ``key`` (``max`` of an empty
            sequence), or if a key cannot be parsed as an integer.
    """
    if key in sample:
        return sample[key]
    target = int(key)
    # key=int makes max() compare numerically; comparing the raw strings
    # would rank '9' above '10'.  ``<=`` also matches a key that is equal in
    # value but spelled differently (e.g. query '0456' against key '456').
    return sample[max((x for x in sample if int(x) <= target), key=int)]
# Demo: exact and in-between lookups against ``sample``.
# A single-argument print(...) behaves the same on Python 2 and Python 3.
print(get_nearest_less_element(sample, '456'))
print(get_nearest_less_element(sample, '235'))
print(get_nearest_less_element(sample, '455'))
print(get_nearest_less_element(sample, '999'))
**输出:**
Bar
Foo
Foo
Hello
**修改:** 根据 Paul 的评论编辑了答案。
答案 2 :(得分:2)
如果样本只创建一次或很少更新,但需要反复查找,那么最高效的做法是先用 O(n log n) 时间预先计算出排序后的数字键列表。这样查找时无需扫描整个字典;二分查找可提供 O(log n) 的访问。Python 标准库中的 bisect 模块提供了这一功能。
from bisect import bisect
def nearest_index(sorted_keys, elem):
    """Return the index of the entry in ``sorted_keys`` closest to ``elem``.

    ``sorted_keys`` must be sorted ascending.  When ``elem`` lies between two
    entries, the closer one wins; an exact tie at the midpoint goes to the
    lower entry.  For an empty list the result is ``-1``.
    """
    pos = bisect(sorted_keys, elem)
    last = len(sorted_keys) - 1
    if pos > last:
        # elem is >= every entry: clamp to the final index.
        return last
    if pos > 0:
        # Choose between the neighbours on either side of the insertion point.
        midpoint = (sorted_keys[pos - 1] + sorted_keys[pos]) / 2.0
        if elem <= midpoint:
            return pos - 1
    return pos
# Example data: integer keys stored as strings.
sample = {'123': 'Foo', '456': 'Bar', '789': 'Hello'}
# The keys converted to integers and sorted once, ready for binary search.
sorted_keys = sorted(int(k) for k in sample.keys())
def get_nearest_element(sample, sorted_keys, elem):
    """Return the value in ``sample`` whose key is numerically closest to ``elem``.

    ``sorted_keys`` holds the keys of ``sample`` as sorted integers; ``elem``
    is converted with ``int()`` before the search.  The chosen integer is
    turned back into a string to index ``sample``.
    """
    nearest_key = sorted_keys[nearest_index(sorted_keys, int(elem))]
    return sample[str(nearest_key)]
# Demo: print the value stored under the key closest to each query.
# A single-argument print(...) behaves the same on Python 2 and Python 3.
for elem in ['456', '235', '455', '999']:
    print(get_nearest_element(sample, sorted_keys, elem))
答案 3 :(得分:2)
根据您的数据集,在构建和查找的时间复杂度方面最有效的数据结构是二叉搜索树(binary search tree):在树大致平衡的情况下,构建需要 O(n log n) 时间,查找需要 O(log n) 时间,空间复杂度为 O(n)。
标准BST算法不包括您的两个特殊约束(据我所知)
以下是基于this implementation的BST实施:
class Node(object):
    """A single key/value entry of a binary search tree."""

    def __init__(self, key, value, parent):
        self.left = None      # child holding smaller keys, or None
        self.right = None     # child holding larger keys, or None
        self.value = value
        self.key = key
        self.parent = parent  # None for the root node

    def __str__(self):
        return ":".join(map(str, (self.key, self.value)))


class BinarySearchTree(object):
    """Unbalanced binary search tree with dict-style item access.

    ``tree[key] = value`` inserts or overwrites, ``tree[key]`` returns the
    stored value or ``None`` when the key is absent, and ``key in tree``
    tests for an exact key.
    """

    def __init__(self):
        self.root = None

    def getRoot(self):
        """Return the root node, or ``None`` for an empty tree."""
        return self.root

    def __setitem__(self, key, value):
        if self.root is None:
            self.root = Node(key, value, None)
        else:
            self._set(key, value, self.root)

    def _set(self, key, value, node):
        """Insert ``key`` below ``node``, or overwrite its value if present."""
        if key == node.key:
            node.value = value
        elif key < node.key:
            if node.left is not None:
                self._set(key, value, node.left)
            else:
                node.left = Node(key, value, node)
        else:
            if node.right is not None:
                self._set(key, value, node.right)
            else:
                node.right = Node(key, value, node)

    def __contains__(self, key):
        """Return True when ``key`` is stored in the tree (exact match).

        The tree is walked directly instead of going through ``_get`` so the
        answer is right for an empty tree and for entries whose value is
        ``None``.
        """
        node = self.root
        while node is not None:
            if key == node.key:
                return True
            node = node.left if key < node.key else node.right
        return False

    def __getitem__(self, key):
        """Return the value stored under ``key``, or ``None`` if absent."""
        if self.root is None:
            return None
        return self._get(key, self.root)

    def _get(self, key, node):
        """Search the subtree rooted at ``node``; ``None`` when not found."""
        if key == node.key:
            return node.value
        if key < node.key and node.left is not None:
            return self._get(key, node.left)
        if key > node.key and node.right is not None:
            return self._get(key, node.right)
        return None
这是一个满足要求1的子类:
class FuzzySearchTree(BinarySearchTree):
    """Search tree whose lookups fall back to the nearest smaller key.

    ``tree[key]`` returns the value stored under ``key`` when present,
    otherwise the value of the largest key that is smaller than ``key``.
    Queries below the smallest key are delegated to ``_checkMin``.
    """

    def _get(self, key, node):
        """Return the value for ``key`` or for its closest smaller key.

        ``node`` is the subtree root to search from.  The last node at which
        the search turned right is remembered: its key is below ``key`` and
        every key visited after it is larger, so it is the best match if the
        search dead-ends.
        """
        floor = None
        while True:
            if key == node.key:
                return node.value
            if key < node.key:
                if node.left is None:
                    break
                node = node.left
            else:
                floor = node
                if node.right is None:
                    break
                node = node.right
        if floor is not None:
            return floor.value
        # Every step went left, so ``node`` holds the smallest key in the
        # searched subtree and ``key`` is below it.
        return self._checkMin(key, node)

    def _checkMin(self, key, node):
        """Handle a query below the smallest key; ``node`` is that minimum.

        This class answers with the minimum's value.  Subclasses may
        override the hook to report such queries differently.
        """
        return node.value
要满足要求2,您需要跟踪树中的最小值。您应该通过跟踪插入时的最小值来实现此目的,但这是一种不同的方法。这种方法不是非常有效,但它应该仍然是o(3 log n)== O(log n),所以它也不错。如果你真的不需要这个,我就不会打扰它了。
class MinBoundedFuzzySearchTree(FuzzySearchTree):
    """Fuzzy search tree that returns ``None`` for keys below its minimum."""

    def _checkMin(self, key, node):
        """Resolve a lookup that stopped at ``node`` with ``key < node.key``.

        ``node`` has no left child, so nothing below it can match.  The
        largest key smaller than ``key`` is then the closest ancestor whose
        key is below ``key``: that is the last point where the search turned
        right, and everything visited afterwards is larger than it.

        Returns:
            The value of that ancestor, or ``None`` when no ancestor
            qualifies, i.e. ``key`` is below every key on the search path
            and therefore outside the lower bound of the tree.
        """
        ancestor = node.parent
        # Walk towards the root, skipping ancestors that are still too large.
        # This is safe when ``node`` is the root (its parent is ``None``).
        while ancestor is not None and ancestor.key > key:
            ancestor = ancestor.parent
        if ancestor is None:
            return None  # outside the range of the tree
        return ancestor.value
以下是一些伪测试:
# Pseudo-tests: each line prints "<expectation>: <actual lookup result>".
# A single-argument print(...) behaves the same on Python 2 and Python 3.

# Plain BST: only exact keys are found; everything else yields None.
tree = BinarySearchTree()
tree[123] = 'Foo'
tree[456] = 'Bar'
tree[789] = 'Hello'
tree[-111] = 'World'
print("BST(456) == 'Bar': " + str(tree[456]))
print("BST(235) == None: " + str(tree[235]))
print("BST(455) == None: " + str(tree[455]))
print("BST(999) == None: " + str(tree[999]))
print("BST(0) == None: " + str(tree[0]))
print("BST(123) == 'Foo': " + str(tree[123]))
print("BST(-110) == None: " + str(tree[-110]))
print("BST(-999) == None: " + str(tree[-999]))

# Fuzzy tree: a missing key falls back to the nearest smaller key.
tree = FuzzySearchTree()
tree[123] = 'Foo'
tree[456] = 'Bar'
tree[789] = 'Hello'
tree[-111] = 'World'
print("")
print("FST(456) == 'Bar': " + str(tree[456]))
print("FST(235) == 'Foo': " + str(tree[235]))
print("FST(455) == 'Foo': " + str(tree[455]))
print("FST(999) == 'Hello': " + str(tree[999]))
print("FST(0) == 'World': " + str(tree[0]))
print("FST(123) == 'Foo': " + str(tree[123]))
print("FST(-110) == 'World': " + str(tree[-110]))
print("FST(-999) == 'World': " + str(tree[-999]))

# Min-bounded fuzzy tree: keys below the minimum yield None.
tree = MinBoundedFuzzySearchTree()
tree[123] = 'Foo'
tree[456] = 'Bar'
tree[789] = 'Hello'
tree[-111] = 'World'
print("")
print("MBFST(456) == 'Bar': " + str(tree[456]))
print("MBFST(235) == 'Foo': " + str(tree[235]))
print("MBFST(455) == 'Foo': " + str(tree[455]))
print("MBFST(999) == 'Hello': " + str(tree[999]))
print("MBFST(0) == 'World': " + str(tree[0]))
print("MBFST(123) == 'Foo': " + str(tree[123]))
print("MBFST(-110) == 'World': " + str(tree[-110]))
print("MBFST(-999) == None: " + str(tree[-999]))
这就是打印的内容:
"""
BST(456) == 'Bar': Bar
BST(235) == None: None
BST(455) == None: None
BST(999) == None: None
BST(0) == None: None
BST(123) == 'Foo': Foo
BST(-110) == None: None
BST(-999) == None: None
FST(456) == 'Bar': Bar
FST(235) == 'Foo': Foo
FST(455) == 'Foo': Foo
FST(999) == 'Hello': Hello
FST(0) == 'World': World
FST(123) == 'Foo': Foo
FST(-110) == 'World': World
FST(-999) == 'World': Foo
MBFST(456) == 'Bar': Bar
MBFST(235) == 'Foo': Foo
MBFST(455) == 'Foo': Foo
MBFST(999) == 'Hello': Hello
MBFST(0) == 'World': World
MBFST(123) == 'Foo': Foo
MBFST(-110) == 'World': World
MBFST(-999) == None: None
"""
答案 4 :(得分:1)
这是一个解决方案。根据数字比较查找最接近的键:
# Example data: integer keys stored as strings.
sample = {'123': 'Foo', '456': 'Bar', '789': 'Hello'}
def get_nearest_less_element(inpDict, targetNum):
    """Return the value whose key is numerically closest to ``targetNum``.

    Despite the name, the match is by absolute distance, so the chosen key
    may be larger than ``targetNum``.  On a tie the key that comes first in
    the dictionary's iteration order wins.

    Args:
        inpDict: Mapping whose keys are strings that ``int()`` can parse.
        targetNum: Target as a number, or as a string ``int()`` can parse.

    Returns:
        The value stored under the closest key.

    Raises:
        KeyError: If ``inpDict`` is empty.
    """
    target = int(targetNum) if isinstance(targetNum, str) else targetNum
    if not inpDict:
        raise KeyError(targetNum)
    # Search the mapping that was passed in (not a module-level global), and
    # let min() track the best distance so no "very big number" sentinel can
    # be exceeded by a real distance.
    nearest_key = min(inpDict, key=lambda k: abs(int(k) - target))
    return inpDict[nearest_key]
# Demo: 456 is the key closest to 500, so its value is printed.
print(get_nearest_less_element(sample, 500))
# Prints Bar
这只是字典中的一个循环,因此在O(n)时间和O(1)额外空间中运行。
答案 5 :(得分:1)
我是这样做的:
def get_nearest_less_element(sample, key):
    """Return the value whose key is the largest one numerically below ``key``.

    Args:
        sample: Mapping whose keys are strings that ``int()`` can parse.
        key: Query key as a string that ``int()`` can parse.

    Returns:
        ``sample[key]`` when the exact key exists, otherwise the value of
        the numerically greatest key smaller than ``int(key)``.  When no
        such key exists (or a key cannot be parsed as an integer), a message
        is printed and ``None`` is returned.
    """
    if key in sample:
        return sample[key]
    try:
        target = int(key)  # parsed once instead of on every iteration
        # key=int makes max() compare numerically; comparing the raw
        # strings would rank '9' above '10'.
        best = max((k for k in sample if int(k) < target), key=int)
    except ValueError:
        # Raised by max() on an empty sequence, or by int() on bad input.
        print("key is beyond lower bounds")
        return None
    return sample[best]