将xml中的每个文档条目分别索引为单独的文档

时间:2017-02-08 09:09:23

标签: xml apache indexing solr

我正在尝试索引一个包含1400个数据文档的xml,格式如下(每个<information>元素从开始标签到结束标签之间的内容都是一个独立的文档)。

<add>
<information>
    <id>a1a</id>
    <author>abcd</author>
    <bibliography>a. b. ,c</bibliography>
    <body>This sample one.</body>
    <title>Sample one</title>
</information>
<information>
    <id>a2a</id>
    <author>xyz</author>
    <bibliography>x. y.x</bibliography>
    <body>This is sample two</body>
    <title>Sample Two</title>
</information>
</add>

我在命令提示符下使用post命令提交文件:java -Durl=http://localhost:8983/solr/update/extract?literal.id=VR -Dtype=application/xml -jar post.jar VR.xml

Schema.xml(直到字段声明)

 <field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" /> 

   <field name="bibliography" type="string" indexed="true" stored="true"/>
   <field name="body" type="text_en" indexed="true" stored="true"/>

   <field name="sku" type="text_en_splitting_tight" indexed="true" stored="true" omitNorms="true"/>
   <field name="name" type="text_general" indexed="true" stored="true"/>
   <field name="manu" type="text_general" indexed="true" stored="true" omitNorms="true"/>
   <field name="cat" type="string" indexed="true" stored="true" multiValued="true"/>
   <field name="features" type="text_general" indexed="true" stored="true" multiValued="true"/>
   <field name="includes" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />

   <field name="weight" type="float" indexed="true" stored="true"/>
   <field name="price"  type="float" indexed="true" stored="true"/>
   <field name="popularity" type="int" indexed="true" stored="true" />
   <field name="inStock" type="boolean" indexed="true" stored="true" />

   <field name="store" type="location" indexed="true" stored="true"/>

   <field name="title" type="text_general" indexed="true" stored="true" multiValued="true"/>
   <field name="subject" type="text_general" indexed="true" stored="true"/>
   <field name="description" type="text_general" indexed="true" stored="true"/>
   <field name="comments" type="text_general" indexed="true" stored="true"/>
   <field name="author" type="text_general" indexed="true" stored="true"/>
   <field name="keywords" type="text_general" indexed="true" stored="true"/>
   <field name="category" type="text_general" indexed="true" stored="true"/>
   <field name="resourcename" type="text_general" indexed="true" stored="true"/>
   <field name="url" type="text_general" indexed="true" stored="true"/>
   <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/>
   <field name="last_modified" type="date" indexed="true" stored="true"/>
   <field name="links" type="string" indexed="true" stored="true" multiValued="true"/>


   <field name="content" type="text_general" indexed="false" stored="true" multiValued="true"/>


   <!-- catchall field, containing all other searchable text fields (implemented
        via copyField further on in this schema) -->
   <field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>

   <!-- catchall text field that indexes tokens both normally and in reverse for efficient
        leading wildcard queries. -->
   <field name="text_rev" type="text_general_rev" indexed="true" stored="false" multiValued="true"/>

   <!-- non-tokenized version of manufacturer to make it easier to sort or group
        results by manufacturer.  copied from "manu" via copyField -->
   <field name="manu_exact" type="string" indexed="true" stored="false"/>

   <field name="payloads" type="payloads" indexed="true" stored="true"/>



   <dynamicField name="*_i"  type="int"    indexed="true"  stored="true"/>
   <dynamicField name="*_is" type="int"    indexed="true"  stored="true"  multiValued="true"/>
   <dynamicField name="*_s"  type="string"  indexed="true"  stored="true" />
   <dynamicField name="*_ss" type="string"  indexed="true"  stored="true" multiValued="true"/>
   <dynamicField name="*_l"  type="long"   indexed="true"  stored="true"/>
   <dynamicField name="*_ls" type="long"   indexed="true"  stored="true"  multiValued="true"/>
   <dynamicField name="*_t"  type="text_general"    indexed="true"  stored="true"/>
   <dynamicField name="*_txt" type="text_general"   indexed="true"  stored="true" multiValued="true"/>
   <dynamicField name="*_en"  type="text_en"    indexed="true"  stored="true" multiValued="true"/>
   <dynamicField name="*_b"  type="boolean" indexed="true" stored="true"/>
   <dynamicField name="*_bs" type="boolean" indexed="true" stored="true"  multiValued="true"/>
   <dynamicField name="*_f"  type="float"  indexed="true"  stored="true"/>
   <dynamicField name="*_fs" type="float"  indexed="true"  stored="true"  multiValued="true"/>
   <dynamicField name="*_d"  type="double" indexed="true"  stored="true"/>
   <dynamicField name="*_ds" type="double" indexed="true"  stored="true"  multiValued="true"/>

   <!-- Type used to index the lat and lon components for the "location" FieldType -->
   <dynamicField name="*_coordinate"  type="tdouble" indexed="true"  stored="false" />

   <dynamicField name="*_dt"  type="date"    indexed="true"  stored="true"/>
   <dynamicField name="*_dts" type="date"    indexed="true"  stored="true" multiValued="true"/>
   <dynamicField name="*_p"  type="location" indexed="true" stored="true"/>

   <!-- some trie-coded dynamic fields for faster range queries -->
   <dynamicField name="*_ti" type="tint"    indexed="true"  stored="true"/>
   <dynamicField name="*_tl" type="tlong"   indexed="true"  stored="true"/>
   <dynamicField name="*_tf" type="tfloat"  indexed="true"  stored="true"/>
   <dynamicField name="*_td" type="tdouble" indexed="true"  stored="true"/>
   <dynamicField name="*_tdt" type="tdate"  indexed="true"  stored="true"/>

   <dynamicField name="*_c"   type="currency" indexed="true"  stored="true"/>

   <dynamicField name="ignored_*" type="ignored" multiValued="true"/>
   <dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/>

   <dynamicField name="random_*" type="random" />

 <uniqueKey>id</uniqueKey>

所以,在这里我添加了xml中提到的所有额外字段和默认配置。

索引编制完成后,如果我在此处输入查询,则给出的响应是

{ "id": "VR", "content_type": [ "application/xml" ], "content": [ "\n \n \n \n \n \n \n \n a1a \n abcd \n a. b. ,c \n This sample one. \n Sample one \n \n \n a2a \n xyz \n x. y.x \n This is sample two \n Sample Two \n \n \n " ], "_version_": 1558754454898999300 }

预期的响应是:

{ "id": "abcd", "content_type": [ "application/xml" ], "content": [ "\n \n \n \n \n \n \n \n a1a \n abcd \n a. b. ,c \n This sample one. \n Sample one \n \n \n a2a \n" ],

    "_version_": 1558754454898999300 }

{ "id": "xyz", "content_type": [ "application/xml" ], "content": [ "xyz \n x. y.x \n This is sample two \n Sample Two \n \n \n" ], "_version_": 1558754454898999301 }

请告诉我我遗漏了什么;如果需要任何其他信息,请随时提出。

3 个答案:

答案 0 :(得分:0)

我认为所有内容都被当作一个文档编入了索引。

将<information></information>标签更改为<doc></doc>:

<add>
<doc>
    <id>a1a</id>
    <author>abcd</author>
    <bibliography>a. b. ,c</bibliography>
    <body>This sample one.</body>
    <title>Sample one</title>
</doc>
<doc>
    <id>a2a</id>
    <author>xyz</author>
    <bibliography>x. y.x</bibliography>
    <body>This is sample two</body>
    <title>Sample Two</title>
</doc>
</add>

答案 1 :(得分:0)

将xml转换为以下格式:

<add>
  <doc>
    <field name="id">a1a</field>
    <field name="author">abcd</field>
    <field name="bibliography">a. b. ,c</field>
    <field name="body">This sample one.</field>
  </doc>

  <doc>
    <field name="id">a2a</field>
    <field name="author">xyz</field>
    <field name="bibliography">x. y. ,z</field>
    <field name="body">Sample Two.</field>
  </doc>
</add>

现在您可以使用以下命令发布数据:

/opt/solr/bin/post -c collection_name VR.xml

答案 2 :(得分:0)

Ashraful的答案中的XML文档是正确的:

<add>
  <doc>
    <field name="id">a1a</field>
    <field name="author">abcd</field>
    <field name="bibliography">a. b. ,c</field>
    <field name="body">This sample one.</field>
  </doc>

  <doc>
    <field name="id">a2a</field>
    <field name="author">xyz</field>
    <field name="bibliography">x. y. ,z</field>
    <field name="body">Sample Two.</field>
  </doc>
</add>

您使用 literal.id=VR 向 ExtractingRequestHandler 提交的请求,会创建/更新一个 id 为 VR 的单个文档。

只需使用 java -jar post.jar VR.xml 发布,即可获得以下结果:

"docs": [ { "id": "a1a", "author": "abcd", "author_s": "abcd", "bibliography": "a. b. ,c", "body": "This sample one.", "_version_": 1558910648654495700 }, { "id": "a2a", "author": "xyz", "author_s": "xyz", "bibliography": "x. y. ,z", "body": "Sample Two.", "_version_": 1558910648656593000 } ]

SELECT make FROM cars
GROUP BY make
HAVING COUNT(type) > 3