我按照这篇“教程”开始处理 Elasticsearch 中的特殊字符:https://www.elastic.co/guide/en/elasticsearch/guide/current/case-folding.html
我为elasticsearch 1.7.x版安装了elasticsearch-analysis-icu版本2.7.0
使用icu_tokenizer创建索引“sonderzeichen”工作正常(我使用nodejs):
// Creates the "sonderzeichen" index with a custom analyzer named
// "my_lowercaser" (icu_tokenizer + icu_normalizer filter) by sending the
// index settings to Elasticsearch in a PUT request.
var http = require('http');

// Index settings, serialized as the JSON request body.
var body = JSON.stringify({
  settings: {
    analysis: {
      analyzer: {
        my_lowercaser: {
          tokenizer: 'icu_tokenizer',
          filter: ['icu_normalizer']
        }
      }
    }
  }
});

var options = {
  host: 'localhost',
  path: '/sonderzeichen',
  port: 9200,
  method: 'PUT',
  headers: {
    'Content-Type': 'application/json',
    // Content-Length is a byte count. String#length counts UTF-16 code units
    // and would be too small as soon as the body contains non-ASCII text.
    'Content-Length': Buffer.byteLength(body)
  }
};

// Collects the response body and prints it once the response has ended.
var callback = function (response) {
  var str = '';
  // Decode as UTF-8 across chunk boundaries so a multi-byte character that is
  // split between two chunks is not corrupted by per-chunk string conversion.
  response.setEncoding('utf8');
  response.on('data', function (chunk) {
    str += chunk;
  });
  response.on('end', function () {
    console.log(str);
  });
};

var request = http.request(options, callback);
// Without an 'error' listener a failed connection surfaces as an uncaught
// exception instead of a readable message.
request.on('error', function (err) {
  console.error('request to Elasticsearch failed: ' + err.message);
});
request.end(body, 'utf8');
我使用了教程中描述的两个分析器:
/_analyze?analyzer=my_lowercaser
和
/sonderzeichen/_analyze?analyzer=my_lowercaser
在 Node.js 中,代码如下:
// Sends a sample string containing umlauts and accented characters to the
// _analyze endpoint and prints the tokens Elasticsearch returns.
var http = require('http');

// Raw text to analyze. The literal contains no percent-escapes, so it is used
// as-is: decodeURIComponent would be a no-op here and throws URIError on a
// stray '%'.
var body = "Weißkopfseeadler WEISSKOPFSEEADLER äÄöÖüÜßáÁéÉíÍóÓúÚàÀèÈìÌòÒùÙ";

var options = {
  host: 'localhost',
  // NOTE(review): this addresses the built-in "standard" analyzer. An analyzer
  // defined in an index's settings (such as "my_lowercaser") is presumably only
  // reachable through that index, e.g.
  // '/sonderzeichen/_analyze?analyzer=my_lowercaser' - confirm which is intended.
  path: '/_analyze?analyzer=standard',
  port: 9200,
  method: "GET",
  headers: {
    'Content-Type': 'application/json',
    // Content-Length must be the UTF-8 byte count. body.length counts UTF-16
    // code units, and every umlaut/accented character above takes two bytes,
    // so the declared length would be shorter than the bytes actually sent.
    'Content-Length': Buffer.byteLength(body, 'utf8')
  }
};

// Collects the response body and prints it once the response has ended.
var callback = function (response) {
  var str = '';
  // Decode as UTF-8 across chunk boundaries so a multi-byte character that is
  // split between two chunks is not corrupted by per-chunk string conversion.
  response.setEncoding('utf8');
  response.on('data', function (chunk) {
    str += chunk;
  });
  response.on('end', function () {
    console.log(str);
  });
};

var request = http.request(options, callback);
// Without an 'error' listener a failed connection surfaces as an uncaught
// exception instead of a readable message.
request.on('error', function (err) {
  console.error('request to Elasticsearch failed: ' + err.message);
});
// Write the body with the same encoding used to compute Content-Length.
request.end(body, 'utf8');
两者都返回完全相同的标记,如下所示(无论我是否使用decodeURIComponent):
{
"tokens": [
{
"token": "wei",
"start_offset": 0,
"end_offset": 3,
"type": "<ALPHANUM>",
"position": 1
},
{
"token": "kopfseeadler",
"start_offset": 4,
"end_offset": 16,
"type": "<ALPHANUM>",
"position": 2
},
{
"token": "weisskopfseeadler",
"start_offset": 17,
"end_offset": 34,
"type": "<ALPHANUM>",
"position": 3
}
]
}
Elasticsearch 仍然无法处理任何特殊字符,我到底哪里出错了?
答案 0 :(得分:0)
我使用以下设置创建了 sonderzeichen 索引:
# Create the "sonderzeichen" index. Its "default" analyzer uses the standard
# tokenizer followed by the "standard" and "asciifolding" token filters.
curl --request PUT localhost:9200/sonderzeichen --data '{
"settings": {
"analysis": {
"analyzer": {
"default": {
"tokenizer": "standard",
"filter": [
"standard",
"asciifolding"
]
}
}
}
}
}'
完成后,我已经分析了你在问题中提到的输入字符串,如下所示:
# Run the index's "default" analyzer over the sample string; the "pretty"
# query parameter requests indented JSON in the response.
curl --request GET 'localhost:9200/sonderzeichen/_analyze?analyzer=default&pretty' --data 'Weißkopfseeadler WEISSKOPFSEEADLER äÄöÖüÜßáÁéÉíÍóÓúÚàÀèÈìÌòÒùÙ'
得到的输出如下,在我看来是正确的:
{
"tokens" : [ {
"token" : "Weisskopfseeadler",
"start_offset" : 0,
"end_offset" : 16,
"type" : "<ALPHANUM>",
"position" : 1
}, {
"token" : "WEISSKOPFSEEADLER",
"start_offset" : 17,
"end_offset" : 34,
"type" : "<ALPHANUM>",
"position" : 2
}, {
"token" : "aAoOuUssaAeEiIoOuUaAeEiIoOuU",
"start_offset" : 35,
"end_offset" : 62,
"type" : "<ALPHANUM>",
"position" : 3
} ]
}
答案 1 :(得分:0)
我们改用 Node.js 的 Elasticsearch 客户端库(elasticsearch.Client)解决了这个问题。出于某种原因,通过 Node 的 http 模块直接发送请求会返回 MapperParsingException,而 curl 和 elasticsearch.Client 都能正常工作。因此,以下写入特殊字符的版本(这里甚至没有使用任何分词器或分析器)在我们的 Elasticsearch 环境中可以正常工作:
// Indexes a document containing umlauts and accented characters through the
// elasticsearch client library instead of a hand-built http request.
var elasticsearch = require('elasticsearch');

var client = new elasticsearch.Client({
  host: 'localhost:9200',
  log: ''
});

client.index({
  index: "sonderzeichen",
  type: "randomType",
  id: "randomId",
  // The document source has to be a JSON object, so the sample text is stored
  // in a field instead of being passed as a bare string (which the client
  // would send unchanged). The field name "text" is an arbitrary choice.
  body: {
    text: "Weißkopfseeadler WEISSKOPFSEEADLER äÄöÖüÜßáÁéÉíÍóÓúÚàÀèÈìÌòÒùÙ"
  }
}, function (err, resp) {
  // Report the failure and stop; resp is only meaningful on success.
  if (err) {
    console.error("error in method writeDB: " + err);
    return;
  }
  console.log("callback from db request: " + JSON.stringify(resp));
});