def index_dir(self, base_path):
num_files_indexed = 0
allfiles = os.listdir(base_path)
#print allfiles
num_files_indexed = len(allfiles)
#print num_files_indexed
docnumber = 0
self._inverted_index = {} #dictionary
for file in allfiles:
self.documents = [base_path+file] #list of all text files
f = open(base_path+file, 'r')
lines = f.read()
# Tokenize the file into words
tokens = self.tokenize(lines)
docnumber = docnumber + 1
print 'docnumber', docnumber
for term in tokens:
# check if the key already exists in the dictionary, if yes,
# just add a new value for the key
#if self._inverted_index.has_key(term)
if term in sorted(self._inverted_index.keys()):
docnumlist = self._inverted_index.get(term)
docnumlist = docnumlist.append(docnumber)
else:
# if the key doesn't exist in dictionary, add the key (term)
# and associate the docnumber value with it.
self._inverted_index = self._inverted_index.update({term: docnumber})
#self._inverted_index[term] = docnumber
f.close()
print 'dictionary', self._inverted_index
print 'keys', self._inverted_index.keys()
return num_files_indexed
我正在开展一个信息检索项目,我们应该抓取多个文本文件,对文件进行标记,并将这些单词存储在倒排列表(字典)数据结构中。
例如:
doc1.txt:“狗跑了”
doc2.txt:“猫睡了”
_inverted_index = {
'the':[0,1],
'狗':[0],
'跑':[0],
'猫':[1],
'睡了':[1]
}
其中0,1是docIDs
我收到以下错误: 'Nontype'对象没有属性键。线#95
非常感谢所有帮助。
答案 0 :(得分:3)
当self._inverted_index
是字典时,self._inverted_index.update
会就地更新并返回None
(就像大多数mutator一样)。因此,代码中的灾难性错误就在于:
self._inverted_index = self._inverted_index.update({term: docnumber})
将self._inverted_index
设置为None
。只需将其更改为
self._inverted_index.update({term: docnumber})
简单地接受就地更新(变异)并且没有错误的分配!