我想按键合并两个 Kyoto Cabinet B-tree 数据库(使用 Kyoto Cabinet 的 Python API)。结果列表应包含两个输入数据库中出现过的所有键(及其值),且每个键只出现一次。
以下代码可以工作,但我觉得它很难看。left_generator / right_generator 是两个 cursor 对象。特别奇怪的是,如果生成器耗尽,get() 会返回 None。
def merge_join_kv(left_generator, right_generator):
    """Merge two key-ordered cursors into one stream of unique (key, value) pairs.

    Both arguments are cursor objects whose ``get()`` returns the current
    ``(key, value)`` record, or ``None`` once the cursor is exhausted, and
    whose ``step()`` advances to the next record.
    NOTE(review): ``get()``/``step()`` are taken from the Kyoto Cabinet Python
    ``Cursor`` API -- confirm for any other cursor type passed in.

    Records are yielded in ascending key order.  When the same key is present
    in both inputs, the left record is yielded and the right one is skipped.

    @param left_generator: cursor over the first database (wins on ties)
    @param right_generator: cursor over the second database
    @return: generator of (key, value) tuples
    """
    left = left_generator.get()
    right = right_generator.get()
    while left is not None or right is not None:
        # A side is consumed when it is the only one left, or when its key is
        # not greater than the other side's key.  On equal keys both flags are
        # set, so the duplicate on the right is dropped.
        take_left = right is None or (left is not None and left[0] <= right[0])
        take_right = left is None or (right is not None and right[0] <= left[0])

        if take_left:
            yield left[0], left[1]
        else:
            yield right[0], right[1]

        # Advance only after the consumer resumes us, so the cursor still
        # points at the yielded record while the caller is processing it.
        if take_left:
            left_generator.step()
            left = left_generator.get()
        if take_right:
            right_generator.step()
            right = right_generator.get()
更一般地说:有没有现成的函数/库,可以借助 cmp() 对多个生成器做合并连接(merge join)?
答案 0 :(得分:4)
我认为这就是你所需要的:orderedMerge 基于 Gnibbler 的代码,但增加了自定义键函数(key)和去重参数(unique)。
import kyotocabinet
import collections
import heapq
class IterableCursor(kyotocabinet.Cursor):
    """Cursor that can be iterated, producing one (key, value) pair per step.

    The iterator protocol is implemented directly (``__iter__`` plus
    ``next``/``__next__``) instead of inheriting from ``collections.Iterator``,
    so the class works on both Python 2 and Python 3.
    """

    def __init__(self, *args, **kwargs):
        # Forward everything to the underlying cursor constructor unchanged.
        kyotocabinet.Cursor.__init__(self, *args, **kwargs)

    def __iter__(self):
        return self

    def next(self):
        "Return (key,value) pair"
        # get(True) reads the current record and then steps the cursor;
        # a None result is treated as "no more records".
        res = self.get(True)
        if res is None:
            raise StopIteration
        return res

    # Python 3 looks up __next__ rather than next.
    __next__ = next
def orderedMerge(*iterables, **kwargs):
    """Take a list of ordered iterables; return as a single ordered generator.

    Each input must already be sorted ascending by ``key``.  Items whose keys
    compare equal are emitted in the order of the iterables they came from.

    @param key:    function, for each item return key value
                   (Hint: to sort descending, return negated key value)
    @param unique: boolean, return only first occurrence for each key value?
    @return: generator yielding the items of all iterables in key order
    """
    key = kwargs.get('key', (lambda x: x))
    unique = kwargs.get('unique', False)

    # Heap entries are [keyval, itnum, data, iterator].  itnum is distinct per
    # entry, so comparisons never reach the data or the iterator object.
    heap = []
    for itnum, it in enumerate(map(iter, iterables)):
        try:
            data = next(it)
        except StopIteration:
            continue  # empty input contributes nothing
        heap.append([key(data), itnum, data, it])
    heapq.heapify(heap)

    # Private sentinel: a legitimate key value (including None) can never be
    # mistaken for "nothing emitted yet".
    nothing_yet = object()
    last_key = nothing_yet

    while heap:
        entry = heap[0]  # entry with the smallest key
        keyval, _, data, it = entry

        is_duplicate = last_key is not nothing_yet and keyval == last_key
        if not (unique and is_duplicate):
            yield data
            last_key = keyval

        # Load the replacement value from the same iterator.
        try:
            data = next(it)
        except StopIteration:
            heapq.heappop(heap)  # this input is exhausted; drop it
            continue
        entry[0] = key(data)
        entry[2] = data
        heapq.heapreplace(heap, entry)  # restore heap condition
然后您的函数可以这样完成:
from operator import itemgetter
def merge_join_kv(leftGen, rightGen):
    """Merge two cursors into one ordered stream with one record per key.

    Each argument is wrapped in an IterableCursor and the two are merged by
    the first element of every record, keeping only the first record seen
    for each key.
    """
    # NOTE(review): this relies on kyotocabinet.Cursor accepting another
    # cursor as its constructor argument (a copy initializer) -- confirm.
    wrapped = [IterableCursor(cursor) for cursor in (leftGen, rightGen)]
    return orderedMerge(*wrapped, key=itemgetter(0), unique=True)
答案 1 :(得分:1)
Python 2.6 的 heapq 模块中有一个 merge 函数,但它不支持用户自定义的 cmp / key 函数:
def merge(*iterables):
    '''Merge multiple sorted inputs into a single sorted output.

    Similar to sorted(itertools.chain(*iterables)) but returns a generator,
    does not pull the data into memory all at once, and assumes that each of
    the input streams is already sorted (smallest to largest).

    >>> list(merge([1,3,5,7], [0,2,4,8], [5,10,15,20], [], [25]))
    [0, 1, 2, 3, 4, 5, 5, 7, 8, 10, 15, 20, 25]

    '''
    # Heap entries are [value, itnum, iterator]; itnum breaks ties between
    # equal values so the iterator objects are never compared.
    heap = []
    for itnum, it in enumerate(map(iter, iterables)):
        try:
            heap.append([next(it), itnum, it])
        except StopIteration:
            pass  # empty input contributes nothing
    heapq.heapify(heap)

    while heap:
        entry = heap[0]  # entry holding the smallest pending value
        yield entry[0]
        try:
            entry[0] = next(entry[2])
        except StopIteration:
            heapq.heappop(heap)  # remove exhausted iterator
        else:
            heapq.heapreplace(heap, entry)  # restore heap condition