我想通过 Elasticsearch 对电子邮件或电话号码进行模糊匹配。例如:
匹配所有以 @gmail.com 结尾的电子邮件
或
匹配所有以 136 开头的电话号码。
我知道我可以使用通配符,
{
"query": {
"wildcard" : {
"email": "*gmail.com"
}
}
}
但性能非常差。我尝试过使用 regexp:
{"query": {"regexp": {"email": {"value": "*163\.com*"} } } }
但是没有用。
有更好的方法吗?
curl -XGET localhost:9200/user_data
{
"user_data": {
"aliases": {},
"mappings": {
"user_data": {
"properties": {
"address": {
"type": "string"
},
"age": {
"type": "long"
},
"comment": {
"type": "string"
},
"created_on": {
"type": "date",
"format": "dateOptionalTime"
},
"custom": {
"properties": {
"key": {
"type": "string"
},
"value": {
"type": "string"
}
}
},
"gender": {
"type": "string"
},
"name": {
"type": "string"
},
"qq": {
"type": "string"
},
"tel": {
"type": "string"
},
"updated_on": {
"type": "date",
"format": "dateOptionalTime"
}
}
}
},
"settings": {
"index": {
"creation_date": "1458832279465",
"uuid": "Fbmthc3lR0ya51zCnWidYg",
"number_of_replicas": "1",
"number_of_shards": "5",
"version": {
"created": "1070299"
}
}
},
"warmers": {}
}
}
映射:
{
"settings": {
"analysis": {
"analyzer": {
"index_phone_analyzer": {
"type": "custom",
"char_filter": [ "digit_only" ],
"tokenizer": "digit_edge_ngram_tokenizer",
"filter": [ "trim" ]
},
"search_phone_analyzer": {
"type": "custom",
"char_filter": [ "digit_only" ],
"tokenizer": "keyword",
"filter": [ "trim" ]
},
"index_email_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [ "lowercase", "name_ngram_filter", "trim" ]
},
"search_email_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [ "lowercase", "trim" ]
}
},
"char_filter": {
"digit_only": {
"type": "pattern_replace",
"pattern": "\\D+",
"replacement": ""
}
},
"tokenizer": {
"digit_edge_ngram_tokenizer": {
"type": "edgeNGram",
"min_gram": "3",
"max_gram": "15",
"token_chars": [ "digit" ]
}
},
"filter": {
"name_ngram_filter": {
"type": "ngram",
"min_gram": "3",
"max_gram": "20"
}
}
}
},
"mappings" : {
"user_data" : {
"properties" : {
"name" : {
"type" : "string",
"analyzer" : "ik"
},
"age" : {
"type" : "integer"
},
"gender": {
"type" : "string"
},
"qq" : {
"type" : "string"
},
"email" : {
"type" : "string",
"analyzer": "index_email_analyzer",
"search_analyzer": "search_email_analyzer"
},
"tel" : {
"type" : "string",
"analyzer": "index_phone_analyzer",
"search_analyzer": "search_phone_analyzer"
},
"address" : {
"type": "string",
"analyzer" : "ik"
},
"comment" : {
"type" : "string",
"analyzer" : "ik"
},
"created_on" : {
"type" : "date",
"format" : "dateOptionalTime"
},
"updated_on" : {
"type" : "date",
"format" : "dateOptionalTime"
},
"custom": {
"type" : "nested",
"properties" : {
"key" : {
"type" : "string"
},
"value" : {
"type" : "string"
}
}
}
}
}
}
}
答案 0 :(得分:5)
一种简单的方法是创建自定义分析器:对电子邮件使用 n-gram 词元过滤器(token filter)(=> 见下面的 index_email_analyzer 和 search_email_analyzer,以及用于精确匹配电子邮件的 email_url_analyzer),对电话号码使用 edge-ngram 分词器(tokenizer)(=> 见下面的 index_phone_analyzer 和 search_phone_analyzer)。
完整的索引定义可在下面找到。
PUT myindex
{
"settings": {
"analysis": {
"analyzer": {
"email_url_analyzer": {
"type": "custom",
"tokenizer": "uax_url_email",
"filter": [ "trim" ]
},
"index_phone_analyzer": {
"type": "custom",
"char_filter": [ "digit_only" ],
"tokenizer": "digit_edge_ngram_tokenizer",
"filter": [ "trim" ]
},
"search_phone_analyzer": {
"type": "custom",
"char_filter": [ "digit_only" ],
"tokenizer": "keyword",
"filter": [ "trim" ]
},
"index_email_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [ "lowercase", "name_ngram_filter", "trim" ]
},
"search_email_analyzer": {
"type": "custom",
"tokenizer": "standard",
"filter": [ "lowercase", "trim" ]
}
},
"char_filter": {
"digit_only": {
"type": "pattern_replace",
"pattern": "\\D+",
"replacement": ""
}
},
"tokenizer": {
"digit_edge_ngram_tokenizer": {
"type": "edgeNGram",
"min_gram": "1",
"max_gram": "15",
"token_chars": [ "digit" ]
}
},
"filter": {
"name_ngram_filter": {
"type": "ngram",
"min_gram": "1",
"max_gram": "20"
}
}
}
},
"mappings": {
"your_type": {
"properties": {
"email": {
"type": "string",
"analyzer": "index_email_analyzer",
"search_analyzer": "search_email_analyzer"
},
"phone": {
"type": "string",
"analyzer": "index_phone_analyzer",
"search_analyzer": "search_phone_analyzer"
}
}
}
}
}
现在,让我们一个接一个地剖析它。
对于 phone 字段,我们的想法是使用 index_phone_analyzer 索引电话值,它使用 edge-ngram 分词器来索引电话号码的所有前缀。因此,如果您的电话号码为 1362435647,则会生成以下词元(token):1、13、136、1362、13624、136243、1362435、13624356、136243564、1362435647。
然后在搜索时,我们使用另一个分析器 search_phone_analyzer,它只是接收输入的数字(例如 136),并通过简单的 match 或 term 查询将其与 phone 字段匹配:
POST myindex/_search
{
"query": {
"term":
{ "phone": "136" }
}
}
对于 email 字段,我们以类似的方式处理:使用 index_email_analyzer 索引电子邮件值,它使用 ngram 词元过滤器,生成可以从电子邮件值中取出的所有长度不等(1 到 20 个字符)的词元。例如:john@gmail.com 将被切分为 j、jo、joh、……、gmail.com、……、john@gmail.com。
然后在搜索时,我们将使用另一个名为 search_email_analyzer 的分析器,它会接收输入并尝试将其与已索引的词元匹配。
POST myindex/_search
{
"query": {
"term":
{ "email": "@gmail.com" }
}
}
此示例中未使用 email_url_analyzer
x=set()
# Cells claimed by player 2; play() adds to it and clear() empties it.
y=set()
# Both players' cell sets in turn order (x first, then y); win() scans this list.
b=[x,y]
# Every winning line as a set of cell indices 0-8: three rows, two diagonals, three columns.
win_con=({0,1,2},{3,4,5},{6,7,8},{0,4,8},{2,4,6},{0,3,6},{1,4,7},{2,5,8})
# Board labels printed by grid(); clear() overwrites each slot with its own index.
# NOTE(review): ten slots are allocated although grid() only reads indices 0-8.
a=['']*10
def clear():
    """Reset the board for a new game.

    Every one of the ten slots in ``a`` is relabelled with its own index and
    both players' move sets are emptied in place.
    """
    # Slice assignment mutates the existing list, so other functions that
    # read the module-level ``a`` keep seeing the same object.
    a[0:10] = range(10)
    for moves in (x, y):
        moves.clear()
def grid():
    """Print the 3x3 board stored in ``a``.

    Cells in a row are separated by ``|`` and rows by a ``--+--+--`` divider.
    No newline is written after the last row.
    """
    for row in range(3):
        left, middle, right = a[3 * row:3 * row + 3]
        print(left, '|', end='')
        print(middle, '|', end='')
        print(right, end='')
        # The divider also supplies the line break between rows.
        if row != 2:
            print('\n--+--+--')
def win():
    """Report whether either player currently holds a complete winning line.

    Returns:
        1 if any line in ``win_con`` is fully contained in one of the two
        players' cell sets in ``b``, otherwise 0.
    """
    for cells in b:
        # Subset test rather than equality: a player who completes a line on
        # the fourth or fifth move owns more cells than the three in the line.
        if any(line <= cells for line in win_con):
            return 1
    return 0
def play():
    """Run interactive two-player games until the user declines to replay.

    Players alternate turns, player 1 marking 'X' and player 2 marking '0'.
    A game ends when ``win()`` reports a win or after nine moves, in which
    case it is a tie. Answering 'y' to the final prompt starts another game.

    Differences from a naive two-moves-per-iteration loop:
      * player 2 is never prompted once the board is full;
      * non-numeric, out-of-range, or already-taken cells are rejected and
        the same player is asked again instead of crashing or overwriting;
      * replaying uses a loop, so long sessions cannot exhaust the recursion
        limit.
    """
    # (mark drawn on the board, set that records the player's cells)
    players = (('X', x), ('0', y))
    while True:
        clear()
        for turn in range(9):
            number = turn % 2 + 1
            mark, cells = players[turn % 2]
            grid()
            while True:
                try:
                    t = int(input("\nPlayer %d enters" % number))
                except ValueError:
                    print('Please enter a number from 0 to 8')
                    continue
                if 0 <= t <= 8 and t not in x and t not in y:
                    break
                print('Please choose a free cell from 0 to 8')
            a[t] = mark
            cells.add(t)
            if win() == 1:
                print('Player %d wins' % number)
                break
        else:
            # All nine cells were filled without anyone winning.
            print("Tie")
        k = input('Play again?')
        if k != 'y':
            break
# Start the interactive game as soon as the script is executed.
play()
分析器,但我已将其包含在内,以防您需要匹配确切的电子邮件值。