朋友们好,
开门见山。我在 MySQL 数据库中存储了很多 BLOB,其中主要是 PDF(约 80%)和 .doc 文件;数据库里也有普通文本。到目前为止,文本部分我已经建立了索引并且可以查询,但我无法为 BLOB 建立索引。我正在尝试把两者放进同一个集合(文档)中,但效果很糟糕。有没有关于如何实现这一点的操作指南?
data-config.xml的一部分:
<?xml version="1.0" encoding="utf-8"?>
<dataConfig>
<!-- JDBC connection to the MySQL schema "ktimatologio"; used by the SQL entities (dataSource="db"). -->
<!-- NOTE(review): per the DataImportHandler documentation, batchSize="-1" is the MySQL
     row-streaming setting and convertType="false" leaves column values in their JDBC types;
     confirm against the Solr version in use. -->
<dataSource name="db"
            type="JdbcDataSource"
            driver="com.mysql.jdbc.Driver"
            url="jdbc:mysql://127.0.0.1:3306/ktimatologio"
            user="root"
            password="********"
            autoCommit="true"
            batchSize="-1"
            convertType="false"/>

<!-- Stream source for the nested TikaEntityProcessor entity (dataSource="fieldReader"). -->
<dataSource name="fieldReader"
            type="FieldStreamDataSource"/>
<document>
<!--
  Plain-text rows (type = 'text') of table aitiologikes_ektheseis.

  pk="id"           : declares the key column that delta-import works with.
  deltaQuery        : returns only the ids of rows changed since the last import; the full
                      row is fetched afterwards by deltaImportQuery through
                      ${dataimporter.delta.id}, so selecting every column (and building the
                      CONCAT of body and title) here was wasted work.
  query / deltaImportQuery are unchanged.
-->
<entity name="aitiologikes_ektheseis"
        dataSource="db"
        pk="id"
        transformer="HTMLStripTransformer"
        query="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, CONCAT( body,' ',title) AS content from aitiologikes_ektheseis where type = 'text'"
        deltaQuery="select id from aitiologikes_ektheseis where type = 'text' and last_modified > '${dataimporter.last_index_time}'"
        deltaImportQuery="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, CONCAT( body,' ',title) AS content from aitiologikes_ektheseis where type = 'text' and id='${dataimporter.delta.id}'">
  <!-- The database id is indexed as "ida"; solr_id is the composite "<id>_<model>" value. -->
  <field column="id" name="ida" />
  <field column="solr_id" name="solr_id" />
  <!-- stripHTML is applied by the HTMLStripTransformer declared on this entity. -->
  <field column="title" name="title" stripHTML="true" />
  <field column="grid_title" name="grid_title" stripHTML="true" />
  <field column="model" name="model" stripHTML="true" />
  <field column="type" name="type" stripHTML="true" />
  <field column="url" name="url" stripHTML="true" />
  <field column="last_modified" name="last_modified" stripHTML="true" />
  <field column="search_tag" name="search_tag" stripHTML="true" />
  <field column="content" name="content" stripHTML="true" />
</entity>
<!--
  Binary rows (type = 'bin') of table aitiologikes_ektheseis. The BLOB column bin_con is
  exposed as "text" and handed to the nested Tika entity through the fieldReader data source.

  pk="id"      : declares the key column that delta-import works with.
  deltaQuery   : returns only the ids of changed rows, so BLOBs are no longer read just to
                 detect changes; deltaImportQuery then loads each row by
                 ${dataimporter.delta.id}.
  transformer  : HTMLStripTransformer is added next to TemplateTransformer because the
                 stripHTML="true" flags below are only honoured by that transformer.
  query / deltaImportQuery are unchanged.
-->
<entity name="aitiologikes_ektheseis_bin"
        dataSource="db"
        pk="id"
        transformer="TemplateTransformer,HTMLStripTransformer"
        query="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, bin_con AS text from aitiologikes_ektheseis where type = 'bin'"
        deltaQuery="select id from aitiologikes_ektheseis where type = 'bin' and last_modified > '${dataimporter.last_index_time}'"
        deltaImportQuery="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, bin_con AS text from aitiologikes_ektheseis where type = 'bin' and id='${dataimporter.delta.id}'">
  <field column="id" name="ida" />
  <field column="solr_id" name="solr_id" />
  <field column="title" name="title" stripHTML="true" />
  <field column="grid_title" name="grid_title" stripHTML="true" />
  <field column="model" name="model" stripHTML="true" />
  <field column="type" name="type" stripHTML="true" />
  <field column="url" name="url" stripHTML="true" />
  <field column="last_modified" name="last_modified" stripHTML="true" />
  <field column="search_tag" name="search_tag" stripHTML="true" />
  <!--
    Tika extraction of the parent row's BLOB. The entity is given an explicit name instead
    of relying on an auto-generated one. No stripHTML here: this entity declares no
    HTMLStripTransformer and format="text" already requests plain text from Tika.
  -->
  <entity name="aitiologikes_ektheseis_tika"
          dataSource="fieldReader"
          processor="TikaEntityProcessor"
          dataField="aitiologikes_ektheseis_bin.text"
          format="text">
    <field column="text" name="contentbin" />
  </entity>
</entity>
...
...
</document>
</dataConfig>
schema.xml中的一部分(fieldTypes和字段定义):
<!--
  Mixed English/Greek text type.

  Filter order (identical at index and query time, except that synonyms are expanded first
  at query time):
    1. English stop words, lower-casing, English possessive removal.
    2. GreekLowerCaseFilter BEFORE the Greek stop filter, so tokens are already normalised
       when they are compared with lang/stopwords_el.txt.
    3. KeywordMarkerFilter BEFORE both stemmers, so entries in protwords.txt are protected
       from GreekStemFilter as well as PorterStemFilter.
    4. Greek stemming, then Porter stemming.

  Documents indexed with the previous filter order must be re-indexed after this change.
-->
<fieldType name="text_ktimatologio" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt" enablePositionIncrements="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPossessiveFilterFactory"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt" enablePositionIncrements="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPossessiveFilterFactory"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>
</fieldType>
<!--
  Greek text type with HTML stripping; used by the "contentbin" field.

  Changes relative to the previous definition:
    - GreekLowerCaseFilter now runs BEFORE the Greek stop filter, so tokens are already
      normalised when they are compared with lang/stopwords_el.txt.
    - The query analyzer listed LowerCaseFilterFactory twice; the duplicate is removed, which
      makes the index and query chains identical.

  NOTE(review): HunspellStemFilter runs after GreekStemFilter, i.e. on tokens that are already
  lower-cased and stemmed; verify that the el_GR dictionary still matches such tokens, or
  keep only one of the two stemmers.

  Documents indexed with the previous filter order must be re-indexed after this change.
-->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.HunspellStemFilterFactory" dictionary="dictionaries/el_GR.dic" affix="dictionaries/el_GR.aff" ignoreCase="true" />
  </analyzer>
  <analyzer type="query">
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.HunspellStemFilterFactory" dictionary="dictionaries/el_GR.dic" affix="dictionaries/el_GR.aff" ignoreCase="true" />
  </analyzer>
</fieldType>
<!-- Field definitions targeted by the data-config.xml field mappings. -->
<fields>
  <!-- Identifiers: "ida" receives the database id column, "solr_id" the composite id/model value. -->
  <field name="ida"           type="string"            indexed="true" stored="true" multiValued="false"/>
  <field name="solr_id"       type="string"            indexed="true" stored="true" multiValued="false"/>

  <!-- Analysed titles. -->
  <field name="title"         type="text_ktimatologio" indexed="true" stored="true"/>
  <field name="grid_title"    type="text_ktimatologio" indexed="true" stored="true"/>

  <!-- Exact-match metadata. -->
  <!-- NOTE(review): last_modified is kept as a plain string, so it is matched and sorted as text. -->
  <field name="model"         type="string"            indexed="true" stored="true" multiValued="false"/>
  <field name="type"          type="string"            indexed="true" stored="true"/>
  <field name="url"           type="string"            indexed="true" stored="true"/>
  <field name="last_modified" type="string"            indexed="true" stored="true"/>
  <field name="search_tag"    type="string"            indexed="true" stored="true"/>

  <!-- Full-text bodies: "contentbin" is filled by the Tika entity, "content" by the SQL text entity. -->
  <field name="contentbin"    type="text"              indexed="true" stored="true" multiValued="true"/>
  <field name="content"       type="text_ktimatologio" indexed="true" stored="true" multiValued="true"/>
</fields>
我真的需要帮助!
此致敬礼,
汤姆
希腊
答案 0 :(得分:0)
你是想“索引”BLOB 吗?也就是说,你希望最终能够搜索其中的内容?我不确定自己是否理解了你的问题。
我猜你是想先用 Solr 中的 Apache Tika 之类的工具转换你的 PDF 或 .doc,然后再让 Solr 为你建立索引。此外,如果你希望让用户访问这些 PDF 或 doc,那么把它们存放在数据库中并从那里检索,是否就是最合适的做法?