我已经为一个文本集合构建了倒排索引字典,现在需要使用分块压缩(blocked compression,k = 8)来压缩字典,并且在倒排记录(postings)文件中,对相邻 docID 之间的间隔(gap)使用 gamma 编码。
def _pack_bits(bits):
    """Pack a string of '0'/'1' characters into a bytearray, 8 bits per byte.

    The final byte is right-padded with '0' bits when ``len(bits)`` is not a
    multiple of 8. An empty string yields an empty bytearray.
    """
    padded = bits + "0" * (-len(bits) % 8)
    return bytearray(int(padded[i:i + 8], 2) for i in range(0, len(padded), 8))


def createCompressedIndex(dictionary_uncomp_v1):
    """Encode every entry of an uncompressed index into ``dictionary_comp_v1``.

    For each term the stored record is the concatenation of, for every
    posting in order, the gamma code of the docId gap followed by the gamma
    code of the term frequency, and finally the gamma code of the entry's
    document frequency. ``entry.totTermFreq`` is not part of the record.

    This function only encodes postings; it does not compress the term
    strings of the dictionary themselves.

    Args:
        dictionary_uncomp_v1: mapping of term -> entry, where each entry
            exposes ``postingList`` (an iterable of objects with ``docId``
            and ``termFreq``) and ``docFreq``.

    Returns:
        The module-level ``dictionary_comp_v1`` mapping that was filled in
        (term -> bytearray). Returning it is a convenience; existing callers
        that ignore the return value are unaffected.
    """
    # NOTE(review): getGammaCode is defined outside this block; presumably it
    # returns a string of '0'/'1' characters -- confirm. Pieces are combined
    # with ``+`` so any concatenable return type still works.
    for term, entry in dictionary_uncomp_v1.items():
        encoded = None
        prev_id = 0
        # Gaps are taken against the previous docId, so this assumes the
        # posting list is sorted by increasing docId -- TODO confirm. The
        # first gap is the first docId itself (prev_id starts at 0).
        for posting in entry.postingList:
            gap_code = getGammaCode(posting.docId - prev_id)
            freq_code = getGammaCode(posting.termFreq)
            piece = gap_code + freq_code
            # Accumulate every posting; keeping only the latest piece would
            # silently drop all earlier postings from the record.
            encoded = piece if encoded is None else encoded + piece
            prev_id = posting.docId

        doc_freq_code = getGammaCode(entry.docFreq)
        # An empty posting list yields a record holding only the docFreq code.
        record = doc_freq_code if encoded is None else encoded + doc_freq_code

        if isinstance(record, str):
            # bytearray(str) needs an encoding on Python 3 and would store one
            # byte per bit anyway; pack the bit string eight bits to a byte.
            # NOTE(review): the tail is zero-padded, so a decoder must know
            # where the real bits end.
            dictionary_comp_v1[term] = _pack_bits(record)
        else:
            dictionary_comp_v1[term] = bytearray(record)

    return dictionary_comp_v1