如何通过在Whoosh中获取额外的字段值来设计自己的评分功能

时间:2014-05-29 07:33:47

标签: python information-retrieval whoosh

我知道 Whoosh 允许你编写自己的评分函数(make your own scoring function)。

我打算做的是:根据词频,再加上一个已经作为字段存入索引的值,来对文本进行评分。仅基于前者(词频)评分没有问题,但要加上后一部分时我遇到了困难。以下是完整代码:

#!/usr/bin/env python
""" 
Information retrieval
For fast searching 

Based on Python 2.7.6
"""

import os, os.path

# Whoosh related
from whoosh.index import create_in
import whoosh.index as index
from whoosh.fields import *
from whoosh.filedb.filestore import FileStorage
from whoosh import scoring 


def myscore_fn(searcher, fieldname, text, matcher):
    """
    Custom weighting callback that scores by term frequency.

    ``searcher``, ``fieldname`` and ``text`` are accepted but not used;
    only ``matcher`` is consulted.

    Returns the result of ``matcher.value_as("frequency")``.
    """
    # Desired behaviour that is NOT implemented yet: return
    #        frequency + myscore
    # where ``myscore`` is the field declared in the Schema built by
    # create_index().
    return matcher.value_as("frequency")

def create_index(indexdir="whooshindex", indexname="usages"):
    """Create a Whoosh index in ``indexdir`` and add three sample documents.

    Args:
        indexdir: Directory holding the index; created when it does not
            exist yet.
        indexname: Accepted for caller compatibility but not used by this
            function (``create_in`` is called without an index name).

    Returns:
        None.
    """
    if not os.path.exists(indexdir):
        os.mkdir(indexdir)

    # Set stored=True on every field that should be displayable later.
    # NOTE(review): ``myscore`` is not stored; presumably it would need
    # stored=True before a scoring function could read it back -- confirm.
    schema = Schema(title=TEXT(stored=True), myscore=NUMERIC, path=ID(stored=True),
            content=TEXT(stored=True))
    ix = create_in(indexdir, schema=schema)

    third_title = "Third document"
    third_path = "/c"
    third_content = "This is. This is third third the third text"
    third_score = 133

    # Using the writer as a context manager commits on success and cancels
    # the writer if add_document raises, so the write lock is not left held.
    with ix.writer() as writer:
        writer.add_document(title=u"First document", myscore=1000, path=u"/a",
                content=u"This is the first document we've added")
        writer.add_document(title=u"Second document", myscore=100, path=u"/b",
                content=u"The second, third one is even more interesting!")
        writer.add_document(title=u(third_title), path=u(third_path),
                myscore=third_score, content=u(third_content))



def search(idx_dir,  kw):
    """docstring for search"""

    # Open index
    storage = FileStorage(idx_dir)
    ix = storage.open_index()

    # Searching
    from whoosh.qparser import QueryParser

    pos_weighting = scoring.FunctionWeighting(myscore_fn)

    with ix.searcher(weighting=pos_weighting) as searcher:
        # fields where you want to perform the search 
        search_field = "content"
        query = QueryParser(search_field, ix.schema).parse(kw)
        results = searcher.search(query, limit=10)
        for result in results:
            rank_score = result.score
            print result["title"], " / ",  result["content"], " (Score: ", rank_score, ")"
    return  

if __name__ == '__main__':
    # Build the sample index, then run one query against it.
    idx_dirname, idx_name = "whooshindex", "usages"

    create_index(idx_dirname, idx_name)
    search(idx_dirname, "third")  # search terms

目前打印此结果:

This is. This is third third the third text  (Score: 3)
The second, third one is even more interesting!  (Score: 1)

我希望它返回的是:

This is. This is third third the third text  (Score: 136)
The second, third one is even more interesting!  (Score: 101)

0 个答案:

没有答案