我正在尝试在 Solr(SolrCloud)7.3 中使用 NER(命名实体识别),但相关文档似乎不够完整。
我找到了什么:
我做了什么:
solrconfig.xml的部分
<!--
  Update chain "multiple-extract".
  Processor order: OpenNLP named-entity extraction, then logging, then the
  run processor. The extraction processor is configured to read the field
  "description_en" and write to the field "content".
-->
<updateRequestProcessorChain name="multiple-extract">
  <processor class="solr.OpenNLPExtractNamedEntitiesUpdateProcessorFactory">
    <!-- OpenNLP model file; presumably resolved relative to the config set (confirm). -->
    <str name="modelFile">opennlp/en-ner-person.bin</str>
    <!-- Name of the schema field type whose analyzer is used for this processor. -->
    <str name="analyzerFieldType">text_opennlp</str>
    <!-- Input field and output field of the extraction. -->
    <str name="source">description_en</str>
    <str name="dest">content</str>
  </processor>
  <processor class="solr.LogUpdateProcessorFactory"/>
  <processor class="solr.RunUpdateProcessorFactory"/>
</updateRequestProcessorChain>
<!--
  Update handler whose default update.chain is "multiple-extract".
  The name attribute is the request path, so it must not carry stray
  whitespace: a value such as "/myupdate " (with a trailing space) would
  presumably not be reachable as /myupdate.
-->
<requestHandler name="/myupdate" class="solr.UpdateRequestHandler">
  <lst name="defaults">
    <str name="update.chain">multiple-extract</str>
  </lst>
</requestHandler>
managed-schema 的相关部分
<!-- Internal fields. -->
<field name="_root_" type="string" indexed="true" stored="false" docValues="false"/>
<field name="_text_" type="text_general" indexed="true" stored="false" multiValued="true"/>
<field name="_version_" type="plong" indexed="false" stored="false"/>

<!--
  OpenNLP-analyzed fields. Both are multi-valued, required, stored, and keep
  full term vectors (positions, offsets, payloads).
  NOTE(review): "content" is declared required while also being named as the
  "dest" of the multiple-extract chain; confirm that requiring a
  client-supplied value here is intended.
-->
<field name="content"
       type="text_opennlp"
       indexed="true"
       stored="true"
       docValues="false"
       multiValued="true"
       required="true"
       termVectors="true"
       termPositions="true"
       termOffsets="true"
       termPayloads="true"/>
<field name="content_pos"
       type="text_opennlp_pos"
       indexed="true"
       stored="true"
       docValues="false"
       multiValued="true"
       required="true"
       termVectors="true"
       termPositions="true"
       termOffsets="true"
       termPayloads="true"/>

<!-- Plain English text field and the document id. -->
<field name="description_en" type="text_en" indexed="true" stored="true"/>
<field name="id" type="string" indexed="true" stored="true" multiValued="false" required="true"/>
<!-- Text field type whose analyzer consists only of the OpenNLP tokenizer. -->
<fieldType name="text_opennlp" class="solr.TextField">
  <analyzer>
    <!-- The tokenizer is given a sentence model and a tokenizer model. -->
    <tokenizer class="solr.OpenNLPTokenizerFactory"
               sentenceModel="opennlp/en-sent.bin"
               tokenizerModel="opennlp/en-token.bin"/>
  </analyzer>
</fieldType>
<!--
  Text field type: OpenNLP tokenizer followed by the OpenNLP part-of-speech
  filter and the type-as-payload filter, in that order.
-->
<fieldType name="text_opennlp_pos" class="solr.TextField">
  <analyzer>
    <tokenizer class="solr.OpenNLPTokenizerFactory"
               sentenceModel="opennlp/en-sent.bin"
               tokenizerModel="opennlp/en-token.bin"/>
    <filter class="solr.OpenNLPPOSFilterFactory"
            posTaggerModel="opennlp/en-pos-maxent.bin"/>
    <filter class="solr.TypeAsPayloadFilterFactory"/>
  </analyzer>
</fieldType>
我的假设
我的假设是:当我使用 multiple-extract 更新处理器链发送数据时,它会从 <str name="source">description_en</str>
中提取命名实体,并放入
<str name="dest">content</str>
。但我完全没有看到这种行为。
我的测试
数据插入请求
POST http://localhost:8983/solr/numberplate/update?version=2.2&wt=xml&update.chain=multiple-extract
<add><doc><field name="id">0e5c7067-9cf0-445b-8374-4bf25484420c</field><field name="description_en">This is Steve Jobs 4 </field><field name="content_pos">This is text 4</field><field name="content"></field></doc></add>
数据插入的响应
<?xml version="1.0" encoding="UTF-8"?>
<response>
<lst name="responseHeader">
<int name="status">0</int>
<int name="QTime">20</int>
</lst>
</response>
请求数据选择
http://localhost:8983/solr/numberplate/select?q=*:*
数据选择的响应
{
"responseHeader": {
"zkConnected": true,
"status": 0,
"QTime": 22,
"params": {
"q": "*:*"
}
},
"response": {
"numFound": 5,
"start": 0,
"maxScore": 1,
"docs": [
{
"id": "0e5c7067-9cf0-445b-8374-4bf25484420c",
"description_en": "This is Steve Jobs 4 ",
"content_pos": [
"This is text 4"
],
"content": [
""
],
"_version_": 1598004417210089500
}
]
}
}
问题
我哪里做错了?也许 Solr 7.3 中的 NER 是以某种我不了解的方式工作的,或者是我对某些地方的理解有误?