我正在尝试索引一个包含1400个文档的XML文件,其格式如下(每个<information>元素从开始标签到结束标签都是一个单独的文档):
<add>
<information>
<id>a1a</id>
<author>abcd</author>
<bibliography>a. b. ,c</bibliography>
<body>This sample one.</body>
<title>Sample one</title>
</information>
<information>
<id>a2a</id>
<author>xyz</author>
<bibliography>x. y.x</bibliography>
<body>This is sample two</body>
<title>Sample Two</title>
</information>
</add>
我在命令提示符下使用post工具提交文件:java -Durl=http://localhost:8983/solr/update/extract?literal.id=VR -Dtype=application/xml -jar post.jar VR.xml
Schema.xml(直到字段声明)
<field name="id" type="string" indexed="true" stored="true" required="true" multiValued="false" />
<field name="bibliography" type="string" indexed="true" stored="true"/>
<field name="body" type="text_en" indexed="true" stored="true"/>
<field name="sku" type="text_en_splitting_tight" indexed="true" stored="true" omitNorms="true"/>
<field name="name" type="text_general" indexed="true" stored="true"/>
<field name="manu" type="text_general" indexed="true" stored="true" omitNorms="true"/>
<field name="cat" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="features" type="text_general" indexed="true" stored="true" multiValued="true"/>
<field name="includes" type="text_general" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" />
<field name="weight" type="float" indexed="true" stored="true"/>
<field name="price" type="float" indexed="true" stored="true"/>
<field name="popularity" type="int" indexed="true" stored="true" />
<field name="inStock" type="boolean" indexed="true" stored="true" />
<field name="store" type="location" indexed="true" stored="true"/>
<field name="title" type="text_general" indexed="true" stored="true" multiValued="true"/>
<field name="subject" type="text_general" indexed="true" stored="true"/>
<field name="description" type="text_general" indexed="true" stored="true"/>
<field name="comments" type="text_general" indexed="true" stored="true"/>
<field name="author" type="text_general" indexed="true" stored="true"/>
<field name="keywords" type="text_general" indexed="true" stored="true"/>
<field name="category" type="text_general" indexed="true" stored="true"/>
<field name="resourcename" type="text_general" indexed="true" stored="true"/>
<field name="url" type="text_general" indexed="true" stored="true"/>
<field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="last_modified" type="date" indexed="true" stored="true"/>
<field name="links" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="content" type="text_general" indexed="false" stored="true" multiValued="true"/>
<!-- catchall field, containing all other searchable text fields (implemented
via copyField further on in this schema -->
<field name="text" type="text_general" indexed="true" stored="false" multiValued="true"/>
<!-- catchall text field that indexes tokens both normally and in reverse for efficient
leading wildcard queries. -->
<field name="text_rev" type="text_general_rev" indexed="true" stored="false" multiValued="true"/>
<!-- non-tokenized version of manufacturer to make it easier to sort or group
results by manufacturer. copied from "manu" via copyField -->
<field name="manu_exact" type="string" indexed="true" stored="false"/>
<field name="payloads" type="payloads" indexed="true" stored="true"/>
<dynamicField name="*_i" type="int" indexed="true" stored="true"/>
<dynamicField name="*_is" type="int" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_s" type="string" indexed="true" stored="true" />
<dynamicField name="*_ss" type="string" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_l" type="long" indexed="true" stored="true"/>
<dynamicField name="*_ls" type="long" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_t" type="text_general" indexed="true" stored="true"/>
<dynamicField name="*_txt" type="text_general" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_en" type="text_en" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_b" type="boolean" indexed="true" stored="true"/>
<dynamicField name="*_bs" type="boolean" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_f" type="float" indexed="true" stored="true"/>
<dynamicField name="*_fs" type="float" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_d" type="double" indexed="true" stored="true"/>
<dynamicField name="*_ds" type="double" indexed="true" stored="true" multiValued="true"/>
<!-- Type used to index the lat and lon components for the "location" FieldType -->
<dynamicField name="*_coordinate" type="tdouble" indexed="true" stored="false" />
<dynamicField name="*_dt" type="date" indexed="true" stored="true"/>
<dynamicField name="*_dts" type="date" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="*_p" type="location" indexed="true" stored="true"/>
<!-- some trie-coded dynamic fields for faster range queries -->
<dynamicField name="*_ti" type="tint" indexed="true" stored="true"/>
<dynamicField name="*_tl" type="tlong" indexed="true" stored="true"/>
<dynamicField name="*_tf" type="tfloat" indexed="true" stored="true"/>
<dynamicField name="*_td" type="tdouble" indexed="true" stored="true"/>
<dynamicField name="*_tdt" type="tdate" indexed="true" stored="true"/>
<dynamicField name="*_c" type="currency" indexed="true" stored="true"/>
<dynamicField name="ignored_*" type="ignored" multiValued="true"/>
<dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/>
<dynamicField name="random_*" type="random" />
<uniqueKey>id</uniqueKey>
所以,在这里我添加了xml中提到的所有额外字段和默认配置。
索引完成后,当我执行查询时,得到的响应是:
{ "id": "VR", "content_type": [ "application/xml" ], "content": [ "\n \n \n \n \n \n \n \n a1a \n abcd \n a. b. ,c \n This sample one. \n \n \n \n \n \n \n a2a \n xyz \n x. y.x \n This is sample two \n Sample Two \n \n \n" ], "_version_": 1558754454898999300 }
预期的反应是:
{ "id": "abcd", "content_type": [ "application/xml" ], "content": [ "\n \n \n \n \n \n \n \n a1a \n abcd \n a. b. ,c \n This sample one. \n Sample one \n \n \n a2a \n" ],
"_version_": 1558754454898999300 }
{ "id": "xyz", "content_type": [ "application/xml" ], "content": [ "xyz \n x. y.x \n This is sample two \n Sample Two \n \n \n" ], "_version_": 1558754454898999301 }
请告诉我我遗漏了什么;如果需要任何额外信息,请随时提出。
答案 0 :(得分:0)
我认为所有内容都被索引成了一个文档。
请将<information></information>标签更改为<doc></doc>:
<add>
<doc>
<id>a1a</id>
<author>abcd</author>
<bibliography>a. b. ,c</bibliography>
<body>This sample one.</body>
<title>Sample one</title>
</doc>
<doc>
<id>a2a</id>
<author>xyz</author>
<bibliography>x. y.x</bibliography>
<body>This is sample two</body>
<title>Sample Two</title>
</doc>
</add>
答案 1 :(得分:0)
将xml转换为以下格式:
<add>
<doc>
<field name="id">a1a</field>
<field name="author">abcd</field>
<field name="bibliography">a. b. ,c</field>
<field name="body">This sample one.</field>
</doc>
<doc>
<field name="id">a2a</field>
<field name="author">xyz</field>
<field name="bibliography">x. y. ,z</field>
<field name="body">Sample Two.</field>
</doc>
</add>
现在您可以使用以下命令发布数据:
/opt/solr/bin/post -c collection_name VR.xml
答案 2 :(得分:0)
Ashraful的答案中的XML文档是正确的:
<add>
<doc>
<field name="id">a1a</field>
<field name="author">abcd</field>
<field name="bibliography">a. b. ,c</field>
<field name="body">This sample one.</field>
</doc>
<doc>
<field name="id">a2a</field>
<field name="author">xyz</field>
<field name="bibliography">x. y. ,z</field>
<field name="body">Sample Two.</field>
</doc>
</add>
但是,您把它提交给了ExtractingRequestHandler并指定了literal.id=VR,因此只会创建/更新一个id为VR的单个文档。
只需改用以下命令发布:
java -jar post.jar VR.xml
即可获得以下结果:
"docs": [
{
"id": "a1a",
"author": "abcd",
"author_s": "abcd",
"bibliography": "a. b. ,c",
"body": "This sample one.",
"_version_": 1558910648654495700
},
{
"id": "a2a",
"author": "xyz",
"author_s": "xyz",
"bibliography": "x. y. ,z",
"body": "Sample Two.",
"_version_": 1558910648656593000
}
]
SELECT make FROM cars
GROUP BY make
HAVING COUNT(type) > 3