Elasticsearch - 将 ngram 用作分词器(tokenizer)与用作过滤器(filter)时输出不同

时间:2015-11-02 11:20:02

标签: elasticsearch

有人可以解释一下,为什么将 ngram 用作分词器(tokenizer)时的输出,与将其用作过滤器(filter)时的输出不同吗?例如,将它用作分词器来分析 "paracetamol"(扑热息痛)时,我得到:

{
   "tokens": [
      {
         "token": "par",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "para",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "parac",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "parace",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "paracet",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "paraceta",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "paracetam",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "paracetamo",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "paracetamol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "ara",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "arac",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "arace",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "aracet",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "araceta",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "aracetam",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "aracetamo",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "aracetamol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "rac",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "race",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "racet",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "raceta",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "racetam",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "racetamo",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "racetamol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "ace",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "acet",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "aceta",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "acetam",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "acetamo",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "acetamol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "cet",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "ceta",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "cetam",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "cetamo",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "cetamol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "eta",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "etam",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "etamo",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "etamol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "tam",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "tamo",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "tamol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "amo",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "amol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      },
      {
         "token": "mol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 1
      }
   ]
}

将它用作过滤器时,我得到:

{
   "tokens": [
      {
         "token": "par",
         "start_offset": 0,
         "end_offset": 3,
         "type": "word",
         "position": 1
      },
      {
         "token": "para",
         "start_offset": 0,
         "end_offset": 4,
         "type": "word",
         "position": 2
      },
      {
         "token": "parac",
         "start_offset": 0,
         "end_offset": 5,
         "type": "word",
         "position": 3
      },
      {
         "token": "parace",
         "start_offset": 0,
         "end_offset": 6,
         "type": "word",
         "position": 4
      },
      {
         "token": "paracet",
         "start_offset": 0,
         "end_offset": 7,
         "type": "word",
         "position": 5
      },
      {
         "token": "paraceta",
         "start_offset": 0,
         "end_offset": 8,
         "type": "word",
         "position": 6
      },
      {
         "token": "paracetam",
         "start_offset": 0,
         "end_offset": 9,
         "type": "word",
         "position": 7
      },
      {
         "token": "paracetamo",
         "start_offset": 0,
         "end_offset": 10,
         "type": "word",
         "position": 8
      },
      {
         "token": "paracetamol",
         "start_offset": 0,
         "end_offset": 11,
         "type": "word",
         "position": 9
      },
      {
         "token": "ara",
         "start_offset": 1,
         "end_offset": 4,
         "type": "word",
         "position": 10
      },
      {
         "token": "arac",
         "start_offset": 1,
         "end_offset": 5,
         "type": "word",
         "position": 11
      },
      {
         "token": "arace",
         "start_offset": 1,
         "end_offset": 6,
         "type": "word",
         "position": 12
      },
      {
         "token": "aracet",
         "start_offset": 1,
         "end_offset": 7,
         "type": "word",
         "position": 13
      },
      {
         "token": "araceta",
         "start_offset": 1,
         "end_offset": 8,
         "type": "word",
         "position": 14
      },
      {
         "token": "aracetam",
         "start_offset": 1,
         "end_offset": 9,
         "type": "word",
         "position": 15
      },
      {
         "token": "aracetamo",
         "start_offset": 1,
         "end_offset": 10,
         "type": "word",
         "position": 16
      },
      {
         "token": "aracetamol",
         "start_offset": 1,
         "end_offset": 11,
         "type": "word",
         "position": 17
      },
      {
         "token": "rac",
         "start_offset": 2,
         "end_offset": 5,
         "type": "word",
         "position": 18
      },
      {
         "token": "race",
         "start_offset": 2,
         "end_offset": 6,
         "type": "word",
         "position": 19
      },
      {
         "token": "racet",
         "start_offset": 2,
         "end_offset": 7,
         "type": "word",
         "position": 20
      },
      {
         "token": "raceta",
         "start_offset": 2,
         "end_offset": 8,
         "type": "word",
         "position": 21
      },
      {
         "token": "racetam",
         "start_offset": 2,
         "end_offset": 9,
         "type": "word",
         "position": 22
      },
      {
         "token": "racetamo",
         "start_offset": 2,
         "end_offset": 10,
         "type": "word",
         "position": 23
      },
      {
         "token": "racetamol",
         "start_offset": 2,
         "end_offset": 11,
         "type": "word",
         "position": 24
      },
      {
         "token": "ace",
         "start_offset": 3,
         "end_offset": 6,
         "type": "word",
         "position": 25
      },
      {
         "token": "acet",
         "start_offset": 3,
         "end_offset": 7,
         "type": "word",
         "position": 26
      },
      {
         "token": "aceta",
         "start_offset": 3,
         "end_offset": 8,
         "type": "word",
         "position": 27
      },
      {
         "token": "acetam",
         "start_offset": 3,
         "end_offset": 9,
         "type": "word",
         "position": 28
      },
      {
         "token": "acetamo",
         "start_offset": 3,
         "end_offset": 10,
         "type": "word",
         "position": 29
      },
      {
         "token": "acetamol",
         "start_offset": 3,
         "end_offset": 11,
         "type": "word",
         "position": 30
      },
      {
         "token": "cet",
         "start_offset": 4,
         "end_offset": 7,
         "type": "word",
         "position": 31
      },
      {
         "token": "ceta",
         "start_offset": 4,
         "end_offset": 8,
         "type": "word",
         "position": 32
      },
      {
         "token": "cetam",
         "start_offset": 4,
         "end_offset": 9,
         "type": "word",
         "position": 33
      },
      {
         "token": "cetamo",
         "start_offset": 4,
         "end_offset": 10,
         "type": "word",
         "position": 34
      },
      {
         "token": "cetamol",
         "start_offset": 4,
         "end_offset": 11,
         "type": "word",
         "position": 35
      },
      {
         "token": "eta",
         "start_offset": 5,
         "end_offset": 8,
         "type": "word",
         "position": 36
      },
      {
         "token": "etam",
         "start_offset": 5,
         "end_offset": 9,
         "type": "word",
         "position": 37
      },
      {
         "token": "etamo",
         "start_offset": 5,
         "end_offset": 10,
         "type": "word",
         "position": 38
      },
      {
         "token": "etamol",
         "start_offset": 5,
         "end_offset": 11,
         "type": "word",
         "position": 39
      },
      {
         "token": "tam",
         "start_offset": 6,
         "end_offset": 9,
         "type": "word",
         "position": 40
      },
      {
         "token": "tamo",
         "start_offset": 6,
         "end_offset": 10,
         "type": "word",
         "position": 41
      },
      {
         "token": "tamol",
         "start_offset": 6,
         "end_offset": 11,
         "type": "word",
         "position": 42
      },
      {
         "token": "amo",
         "start_offset": 7,
         "end_offset": 10,
         "type": "word",
         "position": 43
      },
      {
         "token": "amol",
         "start_offset": 7,
         "end_offset": 11,
         "type": "word",
         "position": 44
      },
      {
         "token": "mol",
         "start_offset": 8,
         "end_offset": 11,
         "type": "word",
         "position": 45
      }
   ]
}

1 个答案:

答案 0 :(得分:0)

这两种方法可能会产生相同的输出,但根据具体情况,其中一种方法可能比另一种更合适。如果您的搜索词中需要包含特殊字符,则可能需要在映射中使用 ngram 分词器(tokenizer)。了解两者各自的用法是很有用的。参考:Reference