编辑：

Question

我为文档建立了一些使用荷兰语（dutch）分析器的搜索索引，效果很好。例如：

http://wetten.cloudant.com/regelingen/_design/RegelingInfo/_search/regeling?q=burgerlijke

但当我尝试把搜索改成模糊搜索时，就完全不行了：

http://wetten.cloudant.com/regelingen/_design/RegelingInfo/_search/regeling?q=burgerlijke~

突然返回0结果。怎么会这样？

编辑：

设计文件：

{"_id": "_design/RegelingInfo",
    "_rev": "11-20993b8c49d8bcc1cd4fde58e5f40b27",
    "views": {
        "all": {
            "map": "function(doc) { \n  if (doc._id.lastIndexOf('BWB', 0) === 0 ){\n   emit( null, doc._id )\n  }\n}"
        }
    },
    "lists": {},
    "shows": {},
    "language": "javascript", "filters": {}, "updates": {}, "indexes": {
    "regeling": {
        "analyzer": {
            "name": "dutch",
            "stopwords": ["wet", "regeling", "besluit"]
        },
        "index": "function(doc) {\n    var globalString = new Array();\n    index(\"displayTitle\", doc.displayTitle, {\"store\": \"yes\"});\n    globalString.push(doc.displayTitle);\n    /*index(\"officieleTitel\", doc.officieleTitel, {\"store\": \"no\"});*/\n    globalString.push(doc.officieleTitel);\n    /*index(\"bwbid\", doc._id);*/\n    globalString.push(doc._id);\n    index(\"regelingSoort\", doc.regelingSoort, {\"store\": \"no\"});\n    if (doc.citeertitels) {\n        for (var i = 0; i < doc.citeertitels.length; i++) {\n            /*index(\"citeertitel\", doc.citeertitels[i].titel, {\"store\": \"no\"});*/\n            globalString.push(doc.citeertitels[i].titel);\n        }\n    }\n    if (doc.afkortingen) {\n        for (var i = 0; i < doc.afkortingen.length; i++) {\n            /*index(\"afkorting\", doc.afkortingen[i], {\"store\": \"no\"});*/\n            globalString.push(doc.afkortingen[i]);\n        }\n    }\n    if (doc.nietOfficieleTitels) {\n        for (var i = 0; i < doc.nietOfficieleTitels.length; i++) {\n            /*index(\"nietOfficieleTitel\", doc.nietOfficieleTitels[i], {\"store\": \"no\"});*/\n            globalString.push(doc.nietOfficieleTitels[i]);\n        }\n    }\n    if (doc.xml) {\n        /* Remove tags to get inner text*/\n        index(\"innerText\", doc.xml.replace(/<[^>]*>/g, \"\"), {\"store\": \"no\"});\n    }\n    index(\"default\", globalString.join(\" \"), {\"store\": \"no\"});\n}"
    }
}}

格式化索引功能：

function(doc) {
    var globalString = new Array();
    index("displayTitle", doc.displayTitle, {"store": "yes"});
    globalString.push(doc.displayTitle);
    /*index("officieleTitel", doc.officieleTitel, {"store": "no"});*/
    globalString.push(doc.officieleTitel);
    /*index("bwbid", doc._id);*/
    globalString.push(doc._id);
    index("regelingSoort", doc.regelingSoort, {"store": "no"});
    if (doc.citeertitels) {
        for (var i = 0; i < doc.citeertitels.length; i++) {
            /*index("citeertitel", doc.citeertitels[i].titel, {"store": "no"});*/
            globalString.push(doc.citeertitels[i].titel);
        }
    }
    if (doc.afkortingen) {
        for (var i = 0; i < doc.afkortingen.length; i++) {
            /*index("afkorting", doc.afkortingen[i], {"store": "no"});*/
            globalString.push(doc.afkortingen[i]);
        }
    }
    if (doc.nietOfficieleTitels) {
        for (var i = 0; i < doc.nietOfficieleTitels.length; i++) {
            /*index("nietOfficieleTitel", doc.nietOfficieleTitels[i], {"store": "no"});*/
            globalString.push(doc.nietOfficieleTitels[i]);
        }
    }
    if (doc.xml) {
        /* Remove tags to get inner text*/
        index("innerText", doc.xml.replace(/<[^>]*>/g, ""), {"store": "no"});
    }
    index("default", globalString.join(" "), {"store": "no"});
}

Answer 1

您可以这样查看分析器的作用：

curl 'http://wetten.cloudant.com/_search_analyze' -d '{"analyzer":"dutch","text":"burgerlijke"}'

返回;

{"tokens":["burger"]}

此查询;

curl 'https://wetten.cloudant.com/regelingen/_design/RegelingInfo/_search/regeling?q=burger~'

返回575行。

不过，这样做有点别扭，这一步本应由系统自动替你完成。我们会调查一下。

Answer 2

我对荷兰语一无所知，但我强烈怀疑问题出在词干提取（stemming）上。

与大多数特定语言的分析器一样，DutchAnalyzer 包含一个词干分析器，用来匹配具有相同词根（即词干）的不同词形。但是，通配符、模糊、正则表达式等查询不会经过分析器处理，而 TermQuery 会。

所以，如果 burgerlijke 在索引中被大幅词干化（我不熟悉这门语言，但看起来很有可能），那么词干化后的版本与未词干化的版本之间的编辑距离很可能太大，以至于无法匹配。例如，如果索引中的词干是“burger”，它与模糊查询词“burgerlijke”之间的编辑距离为 5，相差太远，无法得到结果。

一般来说，词干分析器与任何 MultiTermQuery 都配合得不好。

Answer 3

是的，这完全是分析（analysis）的问题。下面是一个有用的（但未写入文档的）API 端点，可以帮助调试这类问题。请替换成您自己的用户名/凭据，用法如下：

curl 'https://malortmike.cloudant.com/_search_analyze?analyzer=dutch&text="burgerlijke"'
{"tokens":["burger"]}

curl -u 'malortmike:secret' 'https://malortmike.cloudant.com/_search_analyze?analyzer=standard&text="burgerlijke"'
{"tokens":["burgerlijke"]}

能看到不同分析器的实际处理结果，很有帮助。

使用Cloudant / Lucene进行模糊搜索没有结果

编辑：

3 个答案: