Solr的新手。我已经将Solr DIH与TikeEntitiyProcessor一起使用来从本地源D:\ foo \ release中提取数据。我需要编辑要重写为http://的文件路径,我在不同时间都尝试过UpdateHandler和XPathEntityProcessor。它必须在索引提交之前完成。这是我的DIH XML文件。
<dataConfig>
<dataSource type="BinFileDataSource" />
<document>
<entity name="files" dataSource="null" rootEntity="false"
processor="FileListEntityProcessor"
baseDir="d:\normalized\webcontent\bibleforchildren.org" fileName=".*\.(DOC)|(PDF)|(pdf)|(doc)|(docx)|(ppt)|(pptx)|(xls)|(xlsx)|(txt)|(htm)|(html)"
onError="skip"
recursive="true">
<field column="fileAbsolutePath" name="id" />
<field column="fileSize" name="size" />
<field column="fileLastModified" name="lastModified" />
<entity
name="documentImport"
processor="TikaEntityProcessor"
url="${files.fileAbsolutePath}"
format="xml">
<field column="file" name="fileName"/>
<field column="description" name="description" meta="true"/>
<field column="title" name="title" meta="true"/>
<field column="mime_type" name="type" meta="true"/>
<field column="text" name="fulltext"/>
<field column="keywords" name="keywords" meta="true"/>
<entity name="rec" processor="XPathEntityProcessor" url="data.xml" forEach="/xml" dataSource="main" transformer="RegexTransformer" query="select url" from="id"/>
<field column="urls" regex="^[a-z]:\\\w+\\\w+\\" sourceColName="url" />
</entity>
</entity>
</document>
</dataConfig>