我无法弄清楚如何查找带有特殊字符的单词。
例如,我有两个文件:
1)我们正在寻找 C++ 和 C# 开发人员 2)我们正在寻找 C 开发人员
我只想找到包含 C++ 的那个文档。
创建索引,文档和搜索的代码:
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan
# Node URLs handed to the Elasticsearch client constructor.
ELASTIC_SEARCH_NODES = ['http://localhost:9200']
# Name of the index that is created, written to and searched.
INDEX = 'my_index'
# Document type passed to the exists/delete/create calls.
DOC_TYPE = 'material'
def create_index():
    """Create the index named by INDEX and print the client's response.

    The request body registers a custom analyzer called "my_analyzer"
    (whitespace tokenizer followed by the lowercase filter).
    """
    analyzer = {
        "type": "custom",
        "tokenizer": "whitespace",
        "filter": ["lowercase"],
    }
    # NOTE: only index settings are sent; the body has no "mappings" section.
    settings = {"analysis": {"analyzer": {"my_analyzer": analyzer}}}
    response = es_client.indices.create(index=INDEX, body={"settings": settings})
    print(response)
def create_doc(body):
    """Store *body* as a DOC_TYPE document in INDEX and print the response.

    The document id is taken from ``body['docid']``. If a document with
    that id already exists it is deleted first, then created again.
    """
    doc_id = body['docid']
    already_stored = es_client.exists(INDEX, DOC_TYPE, doc_id)
    if already_stored:
        es_client.delete(INDEX, DOC_TYPE, doc_id)
    response = es_client.create(index=INDEX, doc_type=DOC_TYPE, body=body, id=doc_id)
    print(response)
def find_doc(value):
    """Return the scan() generator for a match_phrase search on "text".

    *value* is the phrase to look for in the "text" field of INDEX.
    """
    phrase_query = {"match_phrase": {"text": value}}
    return scan(es_client, query={"query": phrase_query}, index=INDEX)
if __name__ == '__main__':
    # The client is a module-level name; the helper functions read it as a global.
    es_client = Elasticsearch(ELASTIC_SEARCH_NODES, verify_certs=True)

    # One-time setup (left disabled): create the index, then store both documents.
    # create_index()
    doc1 = {"docid": 1, 'text': u"We are looking for C developers"}
    doc2 = {"docid": 2, 'text': u"We are looking for C++ and C# developers"}
    # create_doc(doc1)
    # create_doc(doc2)

    # Print every hit returned for the search term "C++".
    hits = find_doc("C++")
    for hit in hits:
        print(hit)
搜索结果(即使我对 + 进行转义,写成 "C\+\+",结果也是一样的):
{u'_score': 0.0, u'_type': u'material', u'_id': u'2', u'_source': {u'text': u'We are looking for C++ and C# developers', u'docid': 2}, u'_index': u'my_index'}
{u'_score': 0.0, u'_type': u'material', u'_id': u'1', u'_source': {u'text': u'We are looking for C developers', u'docid': 1}, u'_index': u'my_index'}
之所以得到这样的结果,似乎是因为在把文本切分为词元(token)时,+ 和 # 等符号没有被索引,所以实际上它查找的是包含词元 C 的文档:
# Fetch all documents (match_all) and, through a script field named "terms",
# return doc[field].values for the "text" field of each hit.
curl 'http://localhost:9200/my_index/material/_search?pretty=true' -d '{
"query" : {
"match_all" : { }
},
"script_fields": {
"terms" : {
"script": "doc[field].values",
"params": {
"field": "text"
}
}
}
}'
结果:
{
"took" : 3,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"failed" : 0
},
"hits" : {
"total" : 2,
"max_score" : 1.0,
"hits" : [ {
"_index" : "my_index",
"_type" : "material",
"_id" : "2",
"_score" : 1.0,
"fields" : {
"terms" : [ "and", "are", "c", "developers", "for", "looking", "we" ]
}
}, {
"_index" : "my_index",
"_type" : "material",
"_id" : "1",
"_score" : 1.0,
"fields" : {
"terms" : [ "are", "c", "developers", "for", "looking", "we" ]
}
}]
}
}
如何解决这个问题?与上一个问题相关的第二个问题是:是否可以只搜索 % 或 + 这样的非字母数字字符?
P.S. 我使用的是 Elasticsearch 2.3.2 和 elasticsearch==2.3.0(Python 客户端库)。
答案 0 :(得分:0)
谢谢 Andrew,我解决了这个问题。问题在于索引时使用的是标准分析器,而不是 my_analyzer——也就是说,我忘了在映射(mapping)中为字段指定分析器。正确的版本:
# Index-creation body: the analyzer definition plus the mapping that applies it.
data = {
    "settings": {
        "analysis": {
            "analyzer": {
                # Custom analyzer: whitespace tokenizer, then the lowercase filter.
                "my_analyzer": {
                    "type": "custom",
                    "filter": [
                        "lowercase"
                    ],
                    "tokenizer": "whitespace",
                }
            }
        }
    },
    "mappings": {
        # Mapping for the "material" document type.
        "material": {
            "properties": {
                "docid": {
                    "type": "integer"
                },
                "text": {
                    "type": "string",
                    # Bind the custom analyzer to the "text" field.
                    "analyzer": "my_analyzer"
                }
            }
        }
    }
}
此外,还需要重新创建索引并添加文档。
要搜索特殊字符,我使用 query_string。find_doc 函数的代码:
def find_doc(value):
    """Return the scan() generator of hits whose "text" field matches *value*.

    *value* is sent as a ``query_string`` query restricted to the "text"
    field, so query-string syntax in it (for example ``*`` wildcards) is
    interpreted by Elasticsearch. Terms are combined with AND.
    """
    search_body = {
        "query": {
            # Sent directly rather than inside a "filtered" wrapper: the
            # wrapper carried no filter, and the filtered query is
            # deprecated in Elasticsearch 2.x.
            "query_string": {
                "query": value,
                "fields": ["text"],
                # Name of the custom analyzer defined in the index settings,
                # given literally so the function does not rely on an
                # ANALYZER constant being defined elsewhere.
                "analyzer": "my_analyzer",
                "default_operator": "AND",
            }
        }
    }
    return scan(es_client, query=search_body, index=INDEX)
查询示例(现在可以使用通配符):
# Run each sample query in turn and print every hit it returns.
for sample_query in ("*#", u"%", "looking fo*"):
    for hit in find_doc(sample_query):
        print(hit)
用于验证分析器的请求(查看字符串被切分成哪些词元):
# Send a sample sentence to the index's _analyze endpoint with analyzer=my_analyzer.
curl -XPOST "http://localhost:9200/my_index/_analyze?analyzer=my_analyzer&pretty=true" -d 'We are looking for C++ and C# developers'