知道飞快移动make your own scoring function。
这里我打算做的是根据频率加上已经存在的值对文本进行评分 索引作为字段的一部分。基于以前的频率值,我没有问题得分。 但是添加后一部分我有问题。以下是整体代码:
#!/usr/bin/env python
"""
Information retrieval
For fast searching
Based on Python 2.7.6
"""
import os, os.path
# Whoosh related
from whoosh.index import create_in
import whoosh.index as index
from whoosh.fields import *
from whoosh.filedb.filestore import FileStorage
from whoosh import scoring
def myscore_fn(searcher, fieldname, text, matcher):
"""
My weighting function
"""
# currently just taking frequency count
freq = matcher.value_as("frequency")
# What I want to do is to return
# freq + myscore
# where myscore is the field as define in the Schema below.
return freq
def create_index(indexdir="whooshindex",indexname="usages"):
"""docstring for create_index"""
if not os.path.exists(indexdir):
os.mkdir(indexdir)
# set (stored=True) if you want the fields to be
# displayed later
schema = Schema(title=TEXT(stored=True), myscore=NUMERIC, path=ID(stored=True),
content=TEXT(stored=True))
ix = create_in(indexdir, schema=schema)
writer = ix.writer()
writer.add_document(title=u"First document", myscore=1000, path=u"/a",content=u"This is the first document we've added")
writer.add_document(title=u"Second document", myscore=100, path=u"/b",content=u"The second, third one is even more interesting!")
third_title = "Third document"
third_path = "/c"
third_content = "This is. This is third third the third text"
third_score = 133
writer.add_document(title = u(third_title), path= u(third_path),
myscore=third_score, content = u(third_content))
writer.commit()
return
def search(idx_dir, kw):
"""docstring for search"""
# Open index
storage = FileStorage(idx_dir)
ix = storage.open_index()
# Searching
from whoosh.qparser import QueryParser
pos_weighting = scoring.FunctionWeighting(myscore_fn)
with ix.searcher(weighting=pos_weighting) as searcher:
# fields where you want to perform the search
search_field = "content"
query = QueryParser(search_field, ix.schema).parse(kw)
results = searcher.search(query, limit=10)
for result in results:
rank_score = result.score
print result["title"], " / ", result["content"], " (Score: ", rank_score, ")"
return
if __name__ == '__main__':
idx_dirname = "whooshindex"
idx_name = "usages"
create_index(idx_dirname,idx_name)
search(idx_dirname, "third") # search terms
目前打印此结果:
This is. This is third third the third text (Score: 3)
The second, third one is even more interesting! (Score: 1)
我希望它返回的是:
This is. This is third third the third text (Score: 136)
The second, third one is even more interesting! (Score: 101)