如何通过Elasticsearch模糊匹配电子邮件或电话?

时间:2016-03-24 11:52:29

标签: mysql elasticsearch

我想通过Elasticsearch为电子邮件或电话进行模糊匹配。例如:

匹配所有以@gmail.com结尾的电子邮件

匹配所有以136开头的电话。

我知道我可以使用通配符,

{
 "query": {
    "wildcard" : {
      "email": "*gmail.com"
    }
  }
}

但性能非常糟糕。我试着用regexp:

{"query": {"regexp": {"email": {"value": "*163\.com*"} } } }

但是没有用。

有更好的方法吗?

  

curl -XGET localhost:9200/user_data

{
    "user_data": {
        "aliases": {},
        "mappings": {
            "user_data": {
                "properties": {
                    "address": {
                        "type": "string"
                    },
                    "age": {
                        "type": "long"
                    },
                    "comment": {
                        "type": "string"
                    },
                    "created_on": {
                        "type": "date",
                        "format": "dateOptionalTime"
                    },
                    "custom": {
                        "properties": {
                            "key": {
                                "type": "string"
                            },
                            "value": {
                                "type": "string"
                            }
                        }
                    },
                    "gender": {
                        "type": "string"
                    },
                    "name": {
                        "type": "string"
                    },
                    "qq": {
                        "type": "string"
                    },
                    "tel": {
                        "type": "string"
                    },
                    "updated_on": {
                        "type": "date",
                        "format": "dateOptionalTime"
                    }
                }
            }
        },
        "settings": {
            "index": {
                "creation_date": "1458832279465",
                "uuid": "Fbmthc3lR0ya51zCnWidYg",
                "number_of_replicas": "1",
                "number_of_shards": "5",
                "version": {
                    "created": "1070299"
                }
            }
        },
        "warmers": {}
    }
}

映射:

{
  "settings": {
    "analysis": {
      "analyzer": {
        "index_phone_analyzer": {
          "type": "custom",
          "char_filter": [ "digit_only" ],
          "tokenizer": "digit_edge_ngram_tokenizer",
          "filter": [ "trim" ]
        },
        "search_phone_analyzer": {
          "type": "custom",
          "char_filter": [ "digit_only" ],
          "tokenizer": "keyword",
          "filter": [ "trim" ]
        },
        "index_email_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [ "lowercase", "name_ngram_filter", "trim" ]
        },
        "search_email_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [ "lowercase", "trim" ]
        }
      },
      "char_filter": {
        "digit_only": {
          "type": "pattern_replace",
          "pattern": "\\D+",
          "replacement": ""
        }
      },
      "tokenizer": {
        "digit_edge_ngram_tokenizer": {
          "type": "edgeNGram",
          "min_gram": "3",
          "max_gram": "15",
          "token_chars": [ "digit" ]
        }
      },
      "filter": {
        "name_ngram_filter": {
          "type": "ngram",
          "min_gram": "3",
          "max_gram": "20"
        }
      }
    }
  },
  "mappings" : {
    "user_data" : {
      "properties" : {
        "name" : {
          "type" : "string",
          "analyzer" : "ik"
        },
        "age" : {
          "type" : "integer"
        },
        "gender": {
          "type" : "string"
        },
        "qq" : {
          "type" : "string"
        },
        "email" : {
          "type" : "string",
          "analyzer": "index_email_analyzer",
          "search_analyzer": "search_email_analyzer"
        },
        "tel" : {
          "type" : "string",
          "analyzer": "index_phone_analyzer",
          "search_analyzer": "search_phone_analyzer"
        },
        "address" : {
          "type": "string",
          "analyzer" : "ik"
        },
        "comment" : {
          "type" : "string",
          "analyzer" : "ik"
        },
        "created_on" : {
          "type" : "date",
          "format" : "dateOptionalTime"
        },
        "updated_on" : {
          "type" : "date",
          "format" : "dateOptionalTime"
        },
        "custom": {
          "type" : "nested",
          "properties" : {
            "key" : {
              "type" : "string"
            },
            "value" : {
              "type" : "string"
            }
          }
        }
      }
    }
  }
}

1 个答案:

答案 0 :(得分:5)

一种简单的方法是创建自定义分析器:对电子邮件使用n-gram token filter(=>见下面的index_email_analyzer和search_email_analyzer,以及用于精确匹配电子邮件的email_url_analyzer),对电话使用edge-ngram token filter(=>见下面的index_phone_analyzer和search_phone_analyzer)。

完整的索引定义可在下面找到。

PUT myindex
{
  "settings": {
    "analysis": {
      "analyzer": {
        "email_url_analyzer": {
          "type": "custom",
          "tokenizer": "uax_url_email",
          "filter": [ "trim" ]
        },
        "index_phone_analyzer": {
          "type": "custom",
          "char_filter": [ "digit_only" ],
          "tokenizer": "digit_edge_ngram_tokenizer",
          "filter": [ "trim" ]
        },
        "search_phone_analyzer": {
          "type": "custom",
          "char_filter": [ "digit_only" ],
          "tokenizer": "keyword",
          "filter": [ "trim" ]
        },
        "index_email_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [ "lowercase", "name_ngram_filter", "trim" ]
        },
        "search_email_analyzer": {
          "type": "custom",
          "tokenizer": "standard",
          "filter": [ "lowercase", "trim" ]
        }
      },
      "char_filter": {
        "digit_only": {
          "type": "pattern_replace",
          "pattern": "\\D+",
          "replacement": ""
        }
      },
      "tokenizer": {
        "digit_edge_ngram_tokenizer": {
          "type": "edgeNGram",
          "min_gram": "1",
          "max_gram": "15",
          "token_chars": [ "digit" ]
        }
      },
      "filter": {
        "name_ngram_filter": {
          "type": "ngram",
          "min_gram": "1",
          "max_gram": "20"
        }
      }
    }
  },
  "mappings": {
    "your_type": {
      "properties": {
        "email": {
          "type": "string",
          "analyzer": "index_email_analyzer",
          "search_analyzer": "search_email_analyzer"
        },
        "phone": {
          "type": "string",
          "analyzer": "index_phone_analyzer",
          "search_analyzer": "search_phone_analyzer"
        }
      }
    }
  }
}

现在,让我们一个接一个地剖析它。

对于phone字段,我们的想法是使用index_phone_analyzer索引电话值,该分析器使用edge-ngram标记生成器来索引电话号码的所有前缀。因此,如果您的电话号码为1362435647,则会生成以下令牌:1,13,136,1362,13624,136243,1362435,13624356,136243564,1362435647。

然后在搜索时我们使用另一个分析器search_phone_analyzer,它只需获取输入的数字(例如136),并使用简单的match或term查询将其与phone字段匹配:

POST myindex
{ 
    "query": {
        "term": 
            { "phone": "136" }
    }
}

对于email字段,我们以类似的方式进行:使用index_email_analyzer索引电子邮件值,该分析器使用ngram令牌过滤器,它将生成可以从电子邮件值中获取的、长度各不相同(1到20个字符)的所有可能令牌。例如:john@gmail.com将被标记为j,jo,joh,...,gmail.com,...,john@gmail.com。

然后在搜索时,我们将使用另一个名为search_email_analyzer的分析器,它将获取输入并尝试将其与索引标记匹配。

POST myindex
{ 
    "query": {
        "term": 
            { "email": "@gmail.com" }
    }
}

此示例中未使用email_url_analyzer分析器,但我已将其包含在内,以防您需要匹配确切的电子邮件值。