如何使用Elasticsearch配置Haystack上的术语相关性

时间:2014-01-30 11:54:54

标签: python django elasticsearch django-haystack

我已经设置了一个Django项目,现在想要引入haystack和elasticsearch作为后端,用于非数据库的全文搜索。在索引中,我使用EdgeNgramFields作为文本字段。一切正常,但搜索返回的结果太多了。我已阅读http://django-haystack.readthedocs.org/en/latest/上的文档,但没有找到设置相关性选项的方法。

在关于Boost的部分中,他们提到了得分(score)。基本上我希望能够定义一个最低分数阈值。但我找不到关于结果中得分字段的解释。

我错过了什么?我所描述的需求是否有办法实现?

1 个答案:

答案 0 :(得分:1)

在不知道确切的映射和样本数据的情况下,很难告诉您为什么搜索会返回太多结果。但是,我假设你的edgengram分词器使用了一个非常小的起始子串长度,如1或2。使用这样的设置会产生很多匹配,例如,如果起始大小为1,对于以下短语:

a quick brown fox

它将通过以下方式标记:

a q qu qui quick b br bro brow brown f fo fox

这可能会为查询产生大量匹配。作为解决方案,您可以使用另一个起始大小和模糊搜索来查找类似的结果。

但首先,请提供您的确切数据映射,示例数据和查询。

下面是一个示例自定义后端。关键部分是自定义配置部分底部的自定义类型和build_schema函数。

示例自定义后端配置:

# Django settings: point haystack's default connection at the custom engine.
HAYSTACK_CONNECTIONS = {
    'default': {
        # Dotted import path of the CustomElasticSearchEngine subclass.
        'ENGINE': 'myservice.apps.search.search_backends.CustomElasticSearchEngine',
        'URL': 'http://127.0.0.1:9200/',
        'INDEX_NAME': 'haystack_prod',
        'TIMEOUT': 60,
    },
}

示例自定义后端:

from django.conf import settings
from haystack.backends.elasticsearch_backend import ElasticsearchSearchBackend, ElasticsearchSearchEngine
from haystack.fields import NgramField
from haystack.models import SearchResult
import requests
import pyelasticsearch

class CustomElasticBackend(ElasticsearchSearchBackend):
    """Elasticsearch backend with custom analyzers and a hand-built query.

    Two pieces work together:

    * ``DEFAULT_SETTINGS`` declares extra analyzers/filters (full text,
      inner ngrams, front/back edge ngrams) alongside haystack's stock
      ``ngram``/``edgengram`` analyzers.
    * ``build_schema`` maps the custom ``nameword`` and ``email`` field
      types onto ``multi_field`` mappings so one indexed field exposes
      several differently-analyzed sub-fields (``.full``, ``.partial``,
      ``.partial_front``, ``.keyword``, ...), which ``makequery`` then
      boosts individually.
    """

    # Set to False to fall back to haystack's stock query construction
    # instead of the hand-built bool/should query in makequery().
    USE_CUSTOM_QUERY = True

    #DEFAULT_ANALYZER = "snowball"
    DEFAULT_SETTINGS = {
        'settings': {
            "analysis": {
                "analyzer": {
                    # haystack's stock ngram analyzers (kept for the
                    # built-in 'ngram'/'edge_ngram' field types).
                    "ngram_analyzer": {
                        "type": "custom",
                        "tokenizer": "lowercase",
                        "filter": ["haystack_ngram"]
                    },
                    "edgengram_analyzer": {
                        "type": "custom",
                        "tokenizer": "lowercase",
                        "filter": ["lowercase", "asciifolding", "haystack_edgengram"]
                    },
                    # Whole-token analyzer used at search time against the
                    # ngram-indexed sub-fields, so the query itself is NOT
                    # chopped into ngrams.
                    "full_text": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["standard", "lowercase", "asciifolding"]
                    },
                    # Index-time analyzers producing inner ngrams and
                    # front/back edge ngrams respectively.
                    "partial_text": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["standard", "lowercase", "asciifolding", "text_ngrams"]
                    },
                    "partial_text_front": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["standard", "lowercase", "asciifolding", "text_edgengrams_front"]
                    },
                    "partial_text_back": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["standard", "lowercase", "asciifolding", "text_edgengrams_back"]
                    }
                },
                "tokenizer": {
                    "haystack_ngram_tokenizer": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 15,
                    },
                    "haystack_edgengram_tokenizer": {
                        "type": "edgeNGram",
                        "min_gram": 3,
                        "max_gram": 15,
                        "side": "front"
                    }
                },
                "filter": {
                    "haystack_ngram": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 15
                    },
                    "haystack_edgengram": {
                        "type": "edgeNGram",
                        "min_gram": 3,
                        "max_gram": 15
                    },
                    # min_gram of 3 keeps the index from exploding with
                    # 1- and 2-char grams (the cause of "too many results").
                    "text_ngrams": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 50
                    },
                    "text_edgengrams_front": {
                        "type": "edgeNGram",
                        "side": "front",
                        "min_gram": 3,
                        "max_gram": 50
                    },
                    "text_edgengrams_back": {
                        "type": "edgeNGram",
                        "side": "back",
                        "min_gram": 3,
                        "max_gram": 50
                    }
                }
            }
        }
    }

    def makemapping(self, index_fieldname):
        """Return a ``multi_field`` mapping for a name-like text field.

        The base sub-field and ``.partial`` hold inner ngrams,
        ``.partial_front``/``.partial_back`` hold edge ngrams; all ngram
        sub-fields search with the whole-token ``full_text`` analyzer so
        queries match stored grams instead of being re-grammed.
        """
        return {
            "type": "multi_field",
            "fields": {
                index_fieldname: {"type": "string",
                                  "analyzer": "partial_text",
                                  "include_in_all": True},
                "full": {"type": "string",
                         "analyzer": "full_text",
                         "include_in_all": True},
                "partial": {"type": "string",
                            "index_analyzer": "partial_text",
                            "search_analyzer": "full_text",
                            "include_in_all": True},
                "partial_front": {"type": "string",
                                  "index_analyzer": "partial_text_front",
                                  "search_analyzer": "full_text",
                                  "include_in_all": True},
                "partial_back": {"type": "string",
                                 "index_analyzer": "partial_text_back",
                                 "search_analyzer": "full_text",
                                 "include_in_all": True}
            }
        }

    def emailmapping(self, index_fieldname):
        """Return a ``multi_field`` mapping for e-mail addresses: the base
        sub-field is tokenized normally, ``.keyword`` keeps the address as
        a single token for exact matching (boosted higher in makequery)."""
        return {
            "type": "multi_field",
            "fields": {
                index_fieldname: {"type": "string",
                                  "analyzer": "standard"},
                "keyword": {"type": "string",
                            "analyzer": "keyword",
                            "include_in_all": True},
            }
        }

    def makequery(self, param):
        """Build the raw ES ``bool``/``should`` query body for *param*.

        Exact keyword e-mail matches and front-edge-ngram name matches get
        the highest boost (10), inner-ngram matches 5, and two fuzzy
        clauses catch typos. ``size`` caps the window at 100 hits.
        """
        # Trim the first/last character for the fuzzy clauses so a typo at
        # a word boundary still matches; leave short inputs untouched.
        fuzzy_param = param[1:-1] if len(param) > 2 else param
        query = {
            "query": {
                "bool": {
                    "should": [
                        # TODO: autocompletion does not work together with fuzzy search
                        {"fuzzy_like_this": {"fields": ["text.full"], "like_text": fuzzy_param, "max_query_terms": 12}},
                        {"fuzzy": {"text": {"value": fuzzy_param, "min_similarity": 0.6}}},
                        # e-mail fields: analyzed match, plus a higher-boosted
                        # exact match against the .keyword sub-field.
                        {"text": {"email": {"boost": 5, "query": param}}},
                        {"text": {"email.keyword": {"boost": 10, "query": param}}},
                        {"text": {"contact_email": {"boost": 5, "query": param}}},
                        {"text": {"contact_email.keyword": {"boost": 10, "query": param}}},
                        {"text": {"contact_email2": {"boost": 5, "query": param}}},
                        {"text": {"contact_email2.keyword": {"boost": 10, "query": param}}},
                        # names/company: inner ngrams (boost 5) and
                        # front edge ngrams (boost 10, favors prefixes).
                        {"text": {"first_name.partial": {"boost": 5, "query": param}}},
                        {"text": {"first_name.partial_front": {"boost": 10, "query": param}}},
                        {"text": {"last_name.partial": {"boost": 5, "query": param}}},
                        {"text": {"last_name.partial_front": {"boost": 10, "query": param}}},
                        {"text": {"company.partial": {"boost": 5, "query": param}}},
                        {"text": {"company.partial_front": {"boost": 10, "query": param}}},
                        # catch-all document text: phrase-type matches on the
                        # ngram sub-fields, lower boost (less accurate).
                        {"text": {"text.partial": {"boost": 3, "query": param, "type": "phrase"}}},
                        {"text": {"text.partial_front": {"boost": 5, "query": param, "type": "phrase"}}}
                    ]
                }
            },
            "size": 100
        }
        return query

    def search(self, query_string, **kwargs):
        """Run *query_string* against ES and return processed results.

        Empty queries short-circuit without touching ES. Connection/HTTP
        errors are logged and yield empty results unless ``silently_fail``
        is disabled, in which case the exception propagates.
        """
        if not query_string:
            return {
                'results': [],
                'hits': 0,
            }

        if not self.setup_complete:
            self.setup()

        search_kwargs = self.build_search_kwargs(query_string, **kwargs)
        search_kwargs['from'] = kwargs.get('start_offset', 0)

        # Collect every field we sort on so geo-distance ordering can be
        # signalled to _process_results.
        order_fields = {key
                        for order in search_kwargs.get('sort', [])
                        for key in order.keys()}
        geo_sort = '_geo_distance' in order_fields

        start_offset = kwargs.get('start_offset', 0)
        end_offset = kwargs.get('end_offset')
        if end_offset is not None and end_offset > start_offset:
            search_kwargs['size'] = end_offset - start_offset

        try:
            raw_results = self.conn.search(search_kwargs,
                                           index=self.index_name,
                                           doc_type='modelresult')
        except (requests.RequestException, pyelasticsearch.ElasticHttpError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to query Elasticsearch using '%s': %s", query_string, e)
            raw_results = {}

        return self._process_results(raw_results,
            highlight=kwargs.get('highlight'),
            result_class=kwargs.get('result_class', SearchResult),
            distance_point=kwargs.get('distance_point'), geo_sort=geo_sort)

    def build_search_kwargs(self, query_string, **kwargs):
        """Return the raw query body sent to ES.

        Uses the hand-built query from :meth:`makequery` by default; set
        ``USE_CUSTOM_QUERY = False`` to fall back to haystack's stock
        query construction. (Replaces a hard-coded ``if True:`` whose
        else-branch was unreachable.)
        """
        if self.USE_CUSTOM_QUERY:
            return self.makequery(query_string)
        # Fall back to the original haystack implementation.
        return super(CustomElasticBackend, self).build_search_kwargs(query_string, **kwargs)

    def build_schema(self, fields):
        """Build the ES mapping for haystack *fields*.

        Custom ``nameword``/``email`` field types get the multi_field
        mappings from :meth:`makemapping`/:meth:`emailmapping`; everything
        else follows haystack's stock type dispatch.

        Returns a ``(content_field_name, mapping)`` tuple.
        """
        content_field_name = ''
        mapping = {}

        for field_name, field_class in fields.items():
            if field_class.field_type == 'nameword':
                field_mapping = self.makemapping(field_class.index_fieldname)
            elif field_class.field_type == 'email':
                field_mapping = self.emailmapping(field_class.index_fieldname)
            else:
                field_mapping = {
                    'boost': field_class.boost,
                    'index': 'analyzed',
                    'store': 'yes',
                    'type': 'string',
                }

                if field_class.document is True:
                    content_field_name = field_class.index_fieldname

                # DRL_FIXME: Perhaps move to something where, if none of these
                #            checks succeed, call a custom method on the form that
                #            returns, per-backend, the right type of storage?
                if field_class.field_type in ['date', 'datetime']:
                    field_mapping['type'] = 'date'
                elif field_class.field_type == 'integer':
                    field_mapping['type'] = 'long'
                elif field_class.field_type == 'float':
                    field_mapping['type'] = 'float'
                elif field_class.field_type == 'boolean':
                    field_mapping['type'] = 'boolean'
                elif field_class.field_type == 'ngram':
                    field_mapping['analyzer'] = "ngram_analyzer"
                elif field_class.field_type == 'edge_ngram':
                    field_mapping['analyzer'] = "edgengram_analyzer"
                elif field_class.field_type == 'location':
                    field_mapping['type'] = 'geo_point'

                # The docs claim nothing is needed for multivalue fields.

                if field_class.stored is False:
                    field_mapping['store'] = 'no'

                # Do this last to override `text` fields.
                if field_class.indexed is False or hasattr(field_class, 'facet_for'):
                    field_mapping['index'] = 'not_analyzed'

                if field_mapping['type'] == 'string' and field_class.indexed:
                    field_mapping["term_vector"] = "with_positions_offsets"

                    if not hasattr(field_class, 'facet_for') and not field_class.field_type in ('ngram', 'edge_ngram'):
                        field_mapping["analyzer"] = "snowball"

            mapping[field_class.index_fieldname] = field_mapping

        return (content_field_name, mapping)


class CustomElasticSearchEngine(ElasticsearchSearchEngine):
    """Engine wrapper that plugs ``CustomElasticBackend`` into haystack;
    referenced by the ``ENGINE`` key in ``HAYSTACK_CONNECTIONS``."""
    backend = CustomElasticBackend


class NameWordField(NgramField):
    """Ngram field whose ``field_type`` makes ``build_schema`` emit the
    multi_field name mapping from ``CustomElasticBackend.makemapping``."""
    field_type = 'nameword'


class EmailField(NgramField):
    """Ngram field whose ``field_type`` makes ``build_schema`` emit the
    e-mail mapping from ``CustomElasticBackend.emailmapping``."""
    field_type = 'email'