Solr

时间:2017-12-11 13:55:33

标签: solr solrcloud

我有两种语言的数据:英语和韩语。我已经为英语数据建立了索引,现在需要为韩语数据建立索引。我做了一些研究,发现 Solr 对部分语言有内置支持,但我没能在其中明确找到韩语,而其他语言(例如德语、法语等)都能看到。我卡在了如何为韩语实现这一点上。

我尝试对其中一个字段使用 CJK tokenizer,该字段在 schema 中的类型是 text_general,所以我复制了一份并将类型写成 text_general_cjk,但得到了如下错误:invalid unknown_field_type fieldname text_general_cjk

下面是我的 schema,我只需要更新 asr_hypothesis、nlg_output、nlu_utterance 这三个字段。文件中可能包含这两种语言中任意一种的数据,所以 schema 应该能够检测出具体的语言并据此建立索引。

<?xml version="1.0" encoding="UTF-8" ?>

<schema name="default-config" version="1.6">

<field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" />
<!-- docValues are enabled by default for long type so we don't need to index the version field  -->
<field name="_version_" type="plong" indexed="false" stored="false"/>
<field name="_root_" type="string" indexed="true" stored="false" docValues="false" />
<field name="_text_" type="text_general" indexed="true" stored="false" multiValued="true"/>
<field name="sid" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="model_id" type="strings" indexed="true" stored="true" multiValued="false" default=" "/>
<field name="language_code" type="strings" indexed="true" stored="true" multiValued="false" default=" "/>
<field name="country_code" type="strings" indexed="true" stored="true" multiValued="false" default=" "/>
<field name="client_datetime" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="bixby_version" type="strings" indexed="true" stored="true" multiValued="false" default=" "/>
<field name="resource_flag" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="command_mode_04" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="command_mode_08" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="utterance_type" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="output_method" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="audio_length" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="asr_hypothesis" type="text_general" indexed="true" stored="true" multiValued="false" default=" "/>
<field name="asr_silence" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="agent" type="strings" indexed="true" stored="true" multiValued="false" default=" "/>
<field name="command_name" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="screen_states" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="rule_id" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="is_root" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="app_list" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="execute_app" type="text_general" indexed="true" stored="true" multiValued="false" default=" "/>
<field name="event_1010_rule_id" type="text_general" indexed="true" stored="true" multiValued="false" default=" "/>
<field name="is_complete_generation_time" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="is_complete" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="landing_type" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="nlg_output" type="text_general" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="thumbs_result" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="close_type" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="event_22" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="chatbot_resp_id" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="nlu_utterance" type="text_general" indexed="true" stored="true" multiValued="false" default=" "/>
<field name="nlu_matched_domain" type="text_general" indexed="true" stored="true" multiValued="false" default=" "/>
<field name="nlu_display_text" type="text_general" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="nlg_display_text" type="text_general" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="dc_agent" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="nlu_bixby_state_ids" type="text_general" indexed="true" stored="true" multiValued="false" default=" "/>
<field name="user_type" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="rule_chooser_result" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="fe_client_time" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="command_type" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="completeness" type="text_general" indexed="true" stored="true" multiValued="false" default=" "/>
<field name="fr_om" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="event_28" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="event_29" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="event_31" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="event_32" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="event_33" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="nlu_open_qa_session_id" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="nlu_is_open_qa_session" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="nlu_viv_capsule" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="nlu_viv_goal" type="strings" indexed="false" stored="true" multiValued="false" default=" "/>
<field name="yyyymmdd" type="strings" indexed="true" stored="true" multiValued="false" default=" "/>   

1 个答案:

答案 0 :(得分:0)

仅仅在字段类型名后面加上 cjk 是不够的,它并不会因此就神奇地开始工作:

您需要在 schema 中定义一个名称为 text_general_cjk 的 fieldType。下面是一个非常简单的示例,您应根据自己的需求对其进行扩展:

    <!-- Text field type for content that may mix English and CJK (including Korean) text. -->
    <fieldType name="text_general_cjk" class="solr.TextField" positionIncrementGap="100">
      <!-- One untyped analyzer: Solr applies it at both index time and query time. -->
      <analyzer>
        <!-- ICU tokenizer, then lowercase the resulting tokens. -->
        <tokenizer class="solr.ICUTokenizerFactory"/>
        <filter class="solr.LowerCaseFilterFactory"/>
      </analyzer>
    </fieldType>

它只是使用了特殊的 ICUTokenizer,该分词器同样适用于 CJK 语言。您可以根据需要添加更多过滤器,完整列表见 Solr 官方参考指南(同时请查看其中与 CJK 相关的说明)。

在此之后,您可以添加字段:

<!-- Field using the text_general_cjk type; indexed for search, original value not stored. -->
<field name="text_cjk" type="text_general_cjk" indexed="true" stored="false"/>

只有在此之后,您才能使用该字段为文档建立索引。不要忘记:对 schema 做出上述更改后,需要重新启动 Solr 并重新索引。

由于 ICU 分析组件不是 Solr 默认库的一部分,您需要在 solrconfig.xml 中添加对 lucene-analyzers-icu jar 的引用,以便 Solr 能够加载它。