索引和查询存储在 MySQL 中的 BLOB

时间:2012-08-23 18:50:29

标签: mysql solr indexing blob

问候朋友,

开门见山。我在 MySQL 数据库中存储了很多 BLOB,主要是 PDF(约 80%)和 .doc 文件,数据库中也有普通文本。到目前为止,文本已经建立了索引并且可以查询,但我无法为 BLOB 建立索引。我正在尝试把它们放进同一个集合(文档)中,但效果很差。有没有关于如何实现这种需求的现成方案或教程?

data-config.xml的一部分:

<?xml version="1.0" encoding="utf-8"?>

<dataConfig>

<!-- JDBC connection to the MySQL schema queried by the entities that declare dataSource="db".
     batchSize="-1" is the value the DataImportHandler documentation recommends for the MySQL
     driver so that rows are streamed instead of being buffered in memory.
     convertType="false" leaves column values in the form the JDBC driver returns them. -->
<dataSource name="db"
            type="JdbcDataSource"
            driver="com.mysql.jdbc.Driver"
            url="jdbc:mysql://127.0.0.1:3306/ktimatologio"
            user="root"
            password="********"
            batchSize="-1"
            autoCommit="true"
            convertType="false"/>

<!-- Stream data source: lets a nested entity read one column of its parent entity's row as a
     stream. It is selected with dataSource="fieldReader" together with a dataField attribute. -->
<dataSource type="FieldStreamDataSource" name="fieldReader"/>

  <document> 

    <!-- Plain-text rows (type = 'text'). Title and body are concatenated into the "content" column,
         and HTMLStripTransformer removes markup from every field flagged stripHTML="true".
         Delta import: pk names the key column DIH uses to fill ${dataimporter.delta.id}.
         deltaQuery only has to return the ids of rows modified since the last import;
         deltaImportQuery then loads the full row for each of those ids. -->
    <entity name="aitiologikes_ektheseis"
            dataSource="db"
            pk="id"
            transformer="HTMLStripTransformer"
            query="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, CONCAT( body,' ',title)  AS content from aitiologikes_ektheseis where type = 'text'"
            deltaQuery="select id from aitiologikes_ektheseis where type = 'text' and last_modified &gt; '${dataimporter.last_index_time}'"
            deltaImportQuery="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, CONCAT( body,' ',title)  AS content from aitiologikes_ektheseis where type = 'text' and id='${dataimporter.delta.id}'">
      <!-- The database id is indexed under the Solr field "ida"; solr_id is id and model joined by '_'. -->
      <field column="id" name="ida" />
      <field column="solr_id" name="solr_id" />
      <field column="title" name="title" stripHTML="true" />
      <field column="grid_title" name="grid_title" stripHTML="true" />
      <field column="model" name="model" stripHTML="true" />
      <field column="type" name="type" stripHTML="true" />
      <field column="url" name="url" stripHTML="true" />
      <field column="last_modified" name="last_modified" stripHTML="true" />
      <field column="search_tag" name="search_tag" stripHTML="true" />
      <field column="content" name="content" stripHTML="true" />
    </entity>

    <!-- Binary rows (type = 'bin'). The metadata columns are mapped directly; the BLOB column
         bin_con (aliased to "text") is handed to the nested Tika entity for text extraction.
         Delta import: pk names the key column DIH uses to fill ${dataimporter.delta.id}.
         deltaQuery returns only the ids of changed rows, so BLOBs are not fetched just to
         detect changes; deltaImportQuery loads the full row, BLOB included, per changed id.
         HTMLStripTransformer replaces TemplateTransformer: the stripHTML attributes below are
         only acted on by HTMLStripTransformer, and no field here uses a template attribute. -->
    <entity name="aitiologikes_ektheseis_bin"
            dataSource="db"
            pk="id"
            transformer="HTMLStripTransformer"
            query="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, bin_con AS text from aitiologikes_ektheseis where type = 'bin'"
            deltaQuery="select id from aitiologikes_ektheseis where type = 'bin' and last_modified &gt; '${dataimporter.last_index_time}'"
            deltaImportQuery="select id, title, title AS grid_title, model, type, url, last_modified, CONCAT_WS('_',id,model) AS solr_id, search_tag, bin_con AS text from aitiologikes_ektheseis where type = 'bin' and id='${dataimporter.delta.id}'">
      <field column="id" name="ida" />
      <field column="solr_id" name="solr_id" />
      <field column="title" name="title" stripHTML="true" />
      <field column="grid_title" name="grid_title" stripHTML="true" />
      <field column="model" name="model" stripHTML="true" />
      <field column="type" name="type" stripHTML="true" />
      <field column="url" name="url" stripHTML="true" />
      <field column="last_modified" name="last_modified" stripHTML="true" />
      <field column="search_tag" name="search_tag" stripHTML="true" />

      <!-- Nested entity: reads the parent row's "text" column (the BLOB) through the fieldReader
           stream source and maps Tika's extracted "text" column to the contentbin field.
           DIH requires every entity to carry a name, so one is declared here.
           stripHTML is not set on this field: this entity declares no transformer, and
           format="text" already requests plain-text output from Tika. -->
      <entity name="aitiologikes_ektheseis_bin_tika"
              dataSource="fieldReader"
              processor="TikaEntityProcessor"
              dataField="aitiologikes_ektheseis_bin.text"
              format="text">
        <field column="text" name="contentbin" />
      </entity>

    </entity>

       ...
       ...
  </document> 

</dataConfig>

schema.xml中的一部分(fieldTypes和字段定义):

<!-- Mixed English/Greek text type.
     The Greek stop filter now runs AFTER GreekLowerCaseFilterFactory. The stock
     lang/stopwords_el.txt list is written in the normalized form that filter produces
     (no accents, no final sigma), so filtering before normalization lets accented and
     final-sigma stop words through.
     The index and query analyzers apply the same filters in the same order; the query side
     only adds synonym expansion up front.
     Changing the filter order changes the tokens produced: reindex after deploying this.
     NOTE(review): KeywordMarkerFilterFactory sits after GreekStemFilterFactory, so protwords.txt
     only shields terms from the Porter stemmer. Confirm that is intended. -->
<fieldType name="text_ktimatologio" class="solr.TextField" positionIncrementGap="100">

  <analyzer type="index">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <!-- English: stop words, lowercasing, possessive removal -->
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt" enablePositionIncrements="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPossessiveFilterFactory"/>
    <!-- Greek: normalize first, then stop words, then stem -->
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <!-- English stemming, skipping terms listed in protwords.txt -->
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>

  <analyzer type="query">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <!-- Query-time only: synonym expansion -->
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <!-- English: stop words, lowercasing, possessive removal -->
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt" enablePositionIncrements="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPossessiveFilterFactory"/>
    <!-- Greek: normalize first, then stop words, then stem -->
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <!-- English stemming, skipping terms listed in protwords.txt -->
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>

</fieldType>

<!-- Greek text type; HTML markup is stripped from the input before tokenizing.
     The Greek stop filter now runs AFTER GreekLowerCaseFilterFactory. The stock
     lang/stopwords_el.txt list is written in the normalized form that filter produces
     (no accents, no final sigma), so filtering before normalization lets accented and
     final-sigma stop words through.
     The query analyzer previously listed LowerCaseFilterFactory twice; the duplicate is
     removed, which makes the index and query chains identical.
     Changing the filter order changes the tokens produced: reindex after deploying this. -->
<fieldType name="text" class="solr.TextField" positionIncrementGap="100">

  <analyzer type="index">
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- Greek: normalize first, then stop words, then stem -->
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.HunspellStemFilterFactory" dictionary="dictionaries/el_GR.dic" affix="dictionaries/el_GR.aff" ignoreCase="true" />
  </analyzer>

  <analyzer type="query">
    <charFilter class="solr.HTMLStripCharFilterFactory"/>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StandardFilterFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- Greek: normalize first, then stop words, then stem -->
    <filter class="solr.GreekLowerCaseFilterFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_el.txt" enablePositionIncrements="true"/>
    <filter class="solr.GreekStemFilterFactory"/>
    <filter class="solr.HunspellStemFilterFactory" dictionary="dictionaries/el_GR.dic" affix="dictionaries/el_GR.aff" ignoreCase="true" />
  </analyzer>

</fieldType>

<!-- Solr fields that the data-config.xml field mappings write into. -->
<fields>
  <!-- Identifiers: ida receives the database id column, solr_id the id/model combination -->
  <field name="ida" type="string" indexed="true" stored="true" multiValued="false"/>
  <field name="solr_id" type="string" indexed="true" stored="true" multiValued="false"/>

  <!-- Analyzed titles -->
  <field name="title" type="text_ktimatologio" indexed="true" stored="true"/>
  <field name="grid_title" type="text_ktimatologio" indexed="true" stored="true"/>

  <!-- Metadata kept as unanalyzed strings -->
  <field name="model" type="string" indexed="true" stored="true" multiValued="false"/>
  <field name="type" type="string" indexed="true" stored="true"/>
  <field name="url" type="string" indexed="true" stored="true"/>
  <field name="last_modified" type="string" indexed="true" stored="true"/>
  <field name="search_tag" type="string" indexed="true" stored="true"/>

  <!-- Searchable bodies: contentbin holds text extracted from BLOBs, content holds database text -->
  <field name="contentbin" type="text" indexed="true" stored="true" multiValued="true"/>
  <field name="content" type="text_ktimatologio" indexed="true" stored="true" multiValued="true"/>
</fields>

我真的需要帮助!

此致敬礼,

汤姆

希腊

1 个答案:

答案 0 :(得分:0)

你想“索引”一个BLOB吗?意思是你想最终能够搜索它?我不确定我是否理解你的问题。

我猜你是想先用 Solr 中集成的 Apache Tika 之类的工具转换你的 PDF 或 .doc 文件,然后让 Solr 为其建立索引。另外,如果你希望让用户访问这些 PDF 或 doc 文件,那么把它们存放在数据库中并从那里检索,真的是最佳选择吗?