Elasticsearch:ICU插件的特殊字符

时间:2016-01-15 12:21:02

标签: node.js elasticsearch icu

所以我用这个“教程”开始使用elasticsearch中的特殊字符:https://www.elastic.co/guide/en/elasticsearch/guide/current/case-folding.html

我为elasticsearch 1.7.x版安装了elasticsearch-analysis-icu版本2.7.0

使用icu_tokenizer创建索引“sonderzeichen”工作正常(我使用nodejs):

    // Creates the "sonderzeichen" index on a local Elasticsearch node
    // (localhost:9200) with a custom analyzer named "my_lowercaser" that
    // combines the ICU tokenizer with the ICU normalizer token filter.
    var http = require('http');

    // Index settings sent as the JSON request body.
    var body = JSON.stringify({
        "settings": {
            "analysis": {
                "analyzer": {
                    "my_lowercaser": {
                        "tokenizer": "icu_tokenizer",
                        "filter": [ "icu_normalizer" ]
                    }
                }
            }
        }
    });

    var options = {
        host: 'localhost',
        path: '/sonderzeichen',
        port: 9200,
        method: "PUT",
        headers: {
            'Content-Type': 'application/json',
            // Content-Length is a byte count. String#length counts UTF-16
            // code units and is too small as soon as the body contains
            // non-ASCII characters, so measure the encoded size instead.
            'Content-Length': Buffer.byteLength(body)
        }
    };

    // Collects the whole response body and prints it once it is complete.
    var callback = function(response) {
        var str = '';
        // Decode as UTF-8 at the stream level so that a multi-byte character
        // split across two chunks is not corrupted by string concatenation.
        response.setEncoding('utf8');
        response.on('data', function(chunk){
            str += chunk;
        });

        response.on('end', function(){
            console.log(str);
        });
    };

    var request = http.request(options, callback);

    // Without an 'error' listener a refused connection would be thrown as an
    // unhandled 'error' event and crash the process.
    request.on('error', function(err) {
        console.error('request to Elasticsearch failed: ' + err.message);
    });

    request.end(body);

我尝试了教程中描述的两种分析请求:

/_analyze?analyzer=my_lowercaser

/sonderzeichen/_analyze?analyzer=my_lowercaser

在 Node.js 中,代码如下所示:

    // Sends a text containing German/accented characters to the Elasticsearch
    // _analyze endpoint on localhost:9200 and prints the returned tokens.
    var http = require('http');

    // Plain text to analyze. It contains no percent-escapes, so passing it
    // through decodeURIComponent() would be a no-op and has been dropped.
    var body = "Weißkopfseeadler WEISSKOPFSEEADLER äÄöÖüÜßáÁéÉíÍóÓúÚàÀèÈìÌòÒùÙ";

    var options = {
            host: 'localhost',
            // NOTE(review): this uses the built-in "standard" analyzer. The
            // custom "my_lowercaser" analyzer is defined in the index settings,
            // so exercising it presumably requires
            // '/sonderzeichen/_analyze?analyzer=my_lowercaser' - confirm.
            path: '/_analyze?analyzer=standard',
            port: 9200,
            method: "GET",
            headers: {
                'Content-Type': 'application/json',
                // Content-Length is a byte count. The text above is mostly
                // non-ASCII, so its UTF-8 size is larger than String#length
                // (UTF-16 code units). Announcing the smaller number makes
                // the server see a truncated/garbled body.
                'Content-Length': Buffer.byteLength(body)
            }
    };

    // Collects the whole response body and prints it once it is complete.
    var callback = function(response) {
        var str = '';
        // Decode as UTF-8 at the stream level so that a multi-byte character
        // split across two chunks is not corrupted by string concatenation.
        response.setEncoding('utf8');
        response.on('data', function(chunk){
            str += chunk;
        });

        response.on('end', function(){
            console.log(str);
        });
    };

    var request = http.request(options, callback);

    // Without an 'error' listener a refused connection would be thrown as an
    // unhandled 'error' event and crash the process.
    request.on('error', function(err) {
        console.error('request to Elasticsearch failed: ' + err.message);
    });

    request.end(body);

两者都返回完全相同的标记,如下所示(无论我是否使用decodeURIComponent):

    {
      "tokens": [
        {
          "token": "wei",
          "start_offset": 0,
          "end_offset": 3,
          "type": "<ALPHANUM>",
          "position": 1
        },
        {
          "token": "kopfseeadler",
          "start_offset": 4,
          "end_offset": 16,
          "type": "<ALPHANUM>",
          "position": 2
        },
        {
          "token": "weisskopfseeadler",
          "start_offset": 17,
          "end_offset": 34,
          "type": "<ALPHANUM>",
          "position": 3
        }
      ]
    }

Elasticsearch 仍然无法正确处理任何特殊字符,我到底哪里出错了?

2 个答案:

答案 0 :(得分:0)

我使用以下设置创建了sonderzeichen索引:

# Create the "sonderzeichen" index; its "default" analyzer uses the standard
# tokenizer followed by the "standard" and "asciifolding" token filters.
curl -XPUT localhost:9200/sonderzeichen -d '{
  "settings": {
    "analysis": {
      "analyzer": {
        "default": {
          "tokenizer": "standard",
          "filter": [
            "standard",
            "asciifolding"
          ]
        }
      }
    }
  }
}'

完成后,我已经分析了你在问题中提到的输入字符串,如下所示:

 # Run the sample text through the index's "default" analyzer and pretty-print the tokens.
 curl -XGET 'localhost:9200/sonderzeichen/_analyze?analyzer=default&pretty' -d 'Weißkopfseeadler WEISSKOPFSEEADLER äÄöÖüÜßáÁéÉíÍóÓúÚàÀèÈìÌòÒùÙ'

我得到的输出如下,在我看来是正确的:

{
  "tokens" : [ {
    "token" : "Weisskopfseeadler",
    "start_offset" : 0,
    "end_offset" : 16,
    "type" : "<ALPHANUM>",
    "position" : 1
  }, {
    "token" : "WEISSKOPFSEEADLER",
    "start_offset" : 17,
    "end_offset" : 34,
    "type" : "<ALPHANUM>",
    "position" : 2
  }, {
    "token" : "aAoOuUssaAeEiIoOuUaAeEiIoOuU",
    "start_offset" : 35,
    "end_offset" : 62,
    "type" : "<ALPHANUM>",
    "position" : 3
  } ]
}

答案 1 :(得分:0)

我们改用 Node.js 的 Elasticsearch 客户端 API(elasticsearch.Client)解决了这个问题。出于某种原因,通过 Node 直接发送 http 请求会返回 MapperParsingException!而使用 curl 可以正常工作,使用 elasticsearch.Client 也可以正常工作。因此,以下用于写入特殊字符的版本(在这种情况下甚至没有使用任何分词器或分析器)在我们的 Elasticsearch 环境中可以正常工作:

    // Indexes one document containing special characters through the
    // "elasticsearch" client package instead of a hand-built http request.
    var elasticsearch = require('elasticsearch');

    // Client for the local node; `log` is passed as an empty string.
    var clientConfig = { host: 'localhost:9200', log: '' };
    var client = new elasticsearch.Client(clientConfig);

    // Target index/type/id and the raw text used as the document body.
    var indexRequest = {
        index: "sonderzeichen",
        type: "randomType",
        id: "randomId",
        body: "Weißkopfseeadler WEISSKOPFSEEADLER äÄöÖüÜßáÁéÉíÍóÓúÚàÀèÈìÌòÒùÙ"
    };

    // Logs the failure on error, otherwise the serialized response.
    function onIndexed(err, resp) {
        if (err) {
            console.error("error in method writeDB: " + err);
        } else {
            console.log("callback from db request: " + JSON.stringify(resp));
        }
    }

    client.index(indexRequest, onIndexed);