I have set up a Django project and now want to add haystack with elasticsearch as the backend for non-database search. In the index I use EdgeNgramField for the text. Everything works, but the search returns far too many results. I have read the documentation at http://django-haystack.readthedocs.org/en/latest/ but cannot find an option for tuning relevance.
In the section about boost they talk about the score. Basically I would like to be able to define a minimum score, but I cannot find an explanation of the score field on the results.
Am I missing something? Is what I am asking for possible at all?
Answer (score: 1)
Without knowing your exact mapping and some sample data it is hard to tell why the search returns too many results. However, I assume your edgengram tokenizer uses a very small minimum gram size such as 1 or 2. With such a setting there are lots of matches; for example, with a minimum size of 1 the following phrase:
a quick brown fox
would be tokenized as:
a q qu qui quick b br bro brow brown f fo fox
This can produce a huge number of matches for a query. As a solution, you could use a larger minimum gram size together with fuzzy search to still find similar results.
But first, please post your exact data mapping, sample data and query.
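On the minimum score you asked about: Haystack does not expose one, but Elasticsearch accepts a top-level min_score parameter in the search request body, so once you control the raw query (as the custom backend below does in its makequery method) you can drop weak hits yourself. A minimal sketch; the 0.5 threshold is an arbitrary assumption, and raw scores are not normalized, so the cutoff has to be tuned against your own data:

query = {
    "min_score": 0.5,  # hits scoring below this are dropped by Elasticsearch (tune per data set)
    "query": {
        "match": {"text": "quick bro"}
    }
}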
Below is a sample custom backend. The key parts are the custom field types at the bottom, the build_schema function, and the analysis settings in the custom configuration.
Sample custom backend configuration:
HAYSTACK_CONNECTIONS = {
    'default': {
        'ENGINE': 'myservice.apps.search.search_backends.CustomElasticSearchEngine',
        'URL': 'http://127.0.0.1:9200/',
        'INDEX_NAME': 'haystack_prod',
        'TIMEOUT': 60,
    },
}
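Note that the dotted ENGINE path is simply where the CustomElasticSearchEngine class below happens to live in this example; adjust it to wherever you put the backend module in your own project.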
Sample custom backend:
from django.conf import settings
from haystack.backends.elasticsearch_backend import ElasticsearchSearchBackend, ElasticsearchSearchEngine
from haystack.fields import NgramField
from haystack.models import SearchResult
import requests
import pyelasticsearch

class CustomElasticBackend(ElasticsearchSearchBackend):
    #DEFAULT_ANALYZER = "snowball"

    DEFAULT_SETTINGS = {
        'settings': {
            "analysis": {
                "analyzer": {
                    "ngram_analyzer": {
                        "type": "custom",
                        "tokenizer": "lowercase",
                        "filter": ["haystack_ngram"]
                    },
                    "edgengram_analyzer": {
                        "type": "custom",
                        "tokenizer": "lowercase",
                        "filter": ["lowercase", "asciifolding", "haystack_edgengram"]
                    },
                    "full_text": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["standard", "lowercase", "asciifolding"]
                    },
                    "partial_text": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["standard", "lowercase", "asciifolding", "text_ngrams"]
                    },
                    "partial_text_front": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["standard", "lowercase", "asciifolding", "text_edgengrams_front"]
                    },
                    "partial_text_back": {
                        "type": "custom",
                        "tokenizer": "standard",
                        "filter": ["standard", "lowercase", "asciifolding", "text_edgengrams_back"]
                    }
                },
                "tokenizer": {
                    "haystack_ngram_tokenizer": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 15,
                    },
                    "haystack_edgengram_tokenizer": {
                        "type": "edgeNGram",
                        "min_gram": 3,
                        "max_gram": 15,
                        "side": "front"
                    }
                },
                "filter": {
                    "haystack_ngram": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 15
                    },
                    "haystack_edgengram": {
                        "type": "edgeNGram",
                        "min_gram": 3,
                        "max_gram": 15
                    },
                    "text_ngrams": {
                        "type": "nGram",
                        "min_gram": 3,
                        "max_gram": 50
                    },
                    "text_edgengrams_front": {
                        "type": "edgeNGram",
                        "side": "front",
                        "min_gram": 3,
                        "max_gram": 50
                    },
                    "text_edgengrams_back": {
                        "type": "edgeNGram",
                        "side": "back",
                        "min_gram": 3,
                        "max_gram": 50
                    }
                }
            }
        }
    }

    def makemapping(self, index_fieldname):
        # multi_field mapping: the same value is indexed with several analyzers
        # so queries can target full, partial (ngram) and prefix/suffix variants
        return {
            "type": "multi_field",
            "fields": {
                index_fieldname: {"type": "string",
                                  "analyzer": "partial_text",
                                  "include_in_all": True},
                "full": {"type": "string",
                         "analyzer": "full_text",
                         "include_in_all": True},
                "partial": {"type": "string",
                            "index_analyzer": "partial_text",
                            "search_analyzer": "full_text",
                            "include_in_all": True},
                "partial_front": {"type": "string",
                                  "index_analyzer": "partial_text_front",
                                  "search_analyzer": "full_text",
                                  "include_in_all": True},
                "partial_back": {"type": "string",
                                 "index_analyzer": "partial_text_back",
                                 "search_analyzer": "full_text",
                                 "include_in_all": True}
            }
        }

    def emailmapping(self, index_fieldname):
        # emails: analyzed with the standard analyzer, plus a keyword sub-field
        # for exact matches
        return {
            "type": "multi_field",
            "fields": {
                index_fieldname: {"type": "string",
                                  "analyzer": "standard"},
                "keyword": {"type": "string",
                            "analyzer": "keyword",
                            "include_in_all": True},
            }
        }

    def makequery(self, param):
        # Raw Elasticsearch bool query. Note that the legacy "text" query type
        # used below is what later Elasticsearch versions call "match".
        fuzzy_param = param[1:-1] if len(param) > 2 else param
        query = {
            "query": {
                "bool": {
                    "should": [
                        # TODO: autocompletion does not work with fuzzy search
                        {"fuzzy_like_this": {"fields": ["text.full"], "like_text": fuzzy_param, "max_query_terms": 12}},
                        {"fuzzy": {"text": {"value": fuzzy_param, "min_similarity": 0.6}}},
                        #{"fuzzy": {"email": fuzzy_param}},
                        #{"fuzzy": {"first_name": fuzzy_param}},
                        #{"fuzzy": {"last_name": fuzzy_param}},
                        # this for the case first name is a CharField
                        #{"match": {"first_name": {"query": param, "boost": 10}}},
                        #{"match": {"last_name": {"query": param, "boost": 10}}},
                        # email
                        #{"text": {"first_name": {"boost": 5, "query": param, "type": "phrase"}}},
                        {"text": {"email": {"boost": 5, "query": param}}},
                        {"text": {"email.keyword": {"boost": 10, "query": param}}},
                        {"text": {"contact_email": {"boost": 5, "query": param}}},
                        {"text": {"contact_email.keyword": {"boost": 10, "query": param}}},
                        {"text": {"contact_email2": {"boost": 5, "query": param}}},
                        {"text": {"contact_email2.keyword": {"boost": 10, "query": param}}},
                        # first_name
                        #{"text": {"first_name": {"boost": 5, "query": param, "type": "phrase"}}},
                        {"text": {"first_name.partial": {"boost": 5, "query": param}}},
                        {"text": {"first_name.partial_front": {"boost": 10, "query": param}}},
                        #{"text": {"first_name.partial_back": {"boost": 4, "query": param}}},
                        # last_name
                        #{"text": {"last_name": {"boost": 5, "query": param, "type": "phrase"}}},
                        {"text": {"last_name.partial": {"boost": 5, "query": param}}},
                        {"text": {"last_name.partial_front": {"boost": 10, "query": param}}},
                        #{"text": {"last_name.partial_back": {"boost": 4, "query": param}}},
                        # company
                        #{"text": {"company": {"boost": 5, "query": param, "type": "phrase"}}},
                        {"text": {"company.partial": {"boost": 5, "query": param}}},
                        {"text": {"company.partial_front": {"boost": 10, "query": param}}},
                        #{"text": {"company.partial_back": {"boost": 4, "query": param}}},
                        # text
                        # ngrams with less accurate results
                        #{"text": {"text": {"boost": 1, "query": param, "type": "phrase"}}},
                        {"text": {"text.partial": {"boost": 3, "query": param, "type": "phrase"}}},
                        {"text": {"text.partial_front": {"boost": 5, "query": param, "type": "phrase"}}},
                        #{"text": {"text.partial_back": {"boost": 5, "query": param, "type": "phrase"}}}
                    ]
                }
            },
            "size": 100
        }
        return query

    def search(self, query_string, **kwargs):
        if len(query_string) == 0:
            return {
                'results': [],
                'hits': 0,
            }

        if not self.setup_complete:
            self.setup()

        search_kwargs = self.build_search_kwargs(query_string, **kwargs)
        search_kwargs['from'] = kwargs.get('start_offset', 0)

        order_fields = set()
        for order in search_kwargs.get('sort', []):
            for key in order.keys():
                order_fields.add(key)

        geo_sort = '_geo_distance' in order_fields

        end_offset = kwargs.get('end_offset')
        start_offset = kwargs.get('start_offset', 0)
        if end_offset is not None and end_offset > start_offset:
            search_kwargs['size'] = end_offset - start_offset

        try:
            raw_results = self.conn.search(search_kwargs,
                                           index=self.index_name,
                                           doc_type='modelresult')
        except (requests.RequestException, pyelasticsearch.ElasticHttpError) as e:
            if not self.silently_fail:
                raise

            self.log.error("Failed to query Elasticsearch using '%s': %s", query_string, e)
            raw_results = {}

        return self._process_results(raw_results,
                                     highlight=kwargs.get('highlight'),
                                     result_class=kwargs.get('result_class', SearchResult),
                                     distance_point=kwargs.get('distance_point'), geo_sort=geo_sort)

    def build_search_kwargs(self, query_string, **kwargs):
        # Always use the custom raw query; restore the call to super() below to
        # fall back to Haystack's stock query construction.
        return self.makequery(query_string)
        # return super(CustomElasticBackend, self).build_search_kwargs(query_string, **kwargs)

    def build_schema(self, fields):
        content_field_name = ''
        mapping = {}

        for field_name, field_class in fields.items():
            if field_class.field_type == 'nameword':
                field_mapping = self.makemapping(field_class.index_fieldname)
            elif field_class.field_type == 'email':
                field_mapping = self.emailmapping(field_class.index_fieldname)
            else:
                field_mapping = {
                    'boost': field_class.boost,
                    'index': 'analyzed',
                    'store': 'yes',
                    'type': 'string',
                }

            if field_class.document is True:
                content_field_name = field_class.index_fieldname

            # DRL_FIXME: Perhaps move to something where, if none of these
            # checks succeed, call a custom method on the form that
            # returns, per-backend, the right type of storage?
            if field_class.field_type in ['date', 'datetime']:
                field_mapping['type'] = 'date'
            elif field_class.field_type == 'integer':
                field_mapping['type'] = 'long'
            elif field_class.field_type == 'float':
                field_mapping['type'] = 'float'
            elif field_class.field_type == 'boolean':
                field_mapping['type'] = 'boolean'
            elif field_class.field_type == 'ngram':
                field_mapping['analyzer'] = "ngram_analyzer"
            elif field_class.field_type == 'edge_ngram':
                field_mapping['analyzer'] = "edgengram_analyzer"
            elif field_class.field_type == 'location':
                field_mapping['type'] = 'geo_point'

            # The docs claim nothing is needed for multivalue...
            # if field_class.is_multivalued:
            #     field_data['multi_valued'] = 'true'

            if field_class.stored is False:
                field_mapping['store'] = 'no'

            # Do this last to override `text` fields.
            if field_class.indexed is False or hasattr(field_class, 'facet_for'):
                field_mapping['index'] = 'not_analyzed'

            if field_mapping['type'] == 'string' and field_class.indexed:
                field_mapping["term_vector"] = "with_positions_offsets"

                if not hasattr(field_class, 'facet_for') and field_class.field_type not in ('ngram', 'edge_ngram'):
                    field_mapping["analyzer"] = "snowball"

            mapping[field_class.index_fieldname] = field_mapping

        return (content_field_name, mapping)

class CustomElasticSearchEngine(ElasticsearchSearchEngine):
    backend = CustomElasticBackend

class NameWordField(NgramField):
    field_type = 'nameword'


class EmailField(NgramField):
    field_type = 'email'
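
For completeness, here is a rough sketch of how a SearchIndex could use these custom field types so that build_schema picks up the 'nameword' and 'email' mappings; the Contact model and its attribute names are assumptions, not part of the backend above:

from haystack import indexes
from myservice.apps.search.search_backends import NameWordField, EmailField
from myservice.apps.contacts.models import Contact  # hypothetical model


class ContactIndex(indexes.SearchIndex, indexes.Indexable):
    # 'nameword' fields receive the multi_field mapping from makemapping()
    text = NameWordField(document=True, use_template=True)
    first_name = NameWordField(model_attr='first_name')
    last_name = NameWordField(model_attr='last_name')
    company = NameWordField(model_attr='company')
    # 'email' fields receive the standard + keyword multi_field mapping
    email = EmailField(model_attr='email')

    def get_model(self):
        return Contact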