使用Solr改进格式/文本识别

时间:2014-10-08 17:58:22

标签: php solr apache-tika

我想知道是否有办法清理Solr提供的结果格式(特别是文本字段)。许多文档都被编入索引,当被要求提供重点时,Solr在大多数情况下都会返回相当清晰的结果。不幸的是,有很多结果似乎都是由质量很差的文本/亮点组成。

我附上了3张图片。它们都来自同一份文件。忽略黑条,我必须在发布之前编辑一些敏感信息。第一个是PDF如何正常显示。第二个说明了当查询Solr关于此文档时返回的突出显示问题。第三张图片来自主要文本字段,还显示所有文本都被咀嚼起来。这就是它在原始XML中的显示方式。

在索引编制期间或查询期间是否可以执行任何操作以改进文本识别或格式化?

根据要求,以下是schema.xml的相关位:

                                     

<fieldType name="tint" class="solr.TrieIntField" precisionStep="8" positionIncrementGap="0"/>
<fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" positionIncrementGap="0"/>
<fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" positionIncrementGap="0"/>
<fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" positionIncrementGap="0"/>

<fieldType name="date" class="solr.TrieDateField" precisionStep="0" positionIncrementGap="0"/>
<fieldType name="tdate" class="solr.TrieDateField" precisionStep="6" positionIncrementGap="0"/>

<fieldType name="pint" class="solr.IntField"/>
<fieldType name="plong" class="solr.LongField"/>
<fieldType name="pfloat" class="solr.FloatField"/>
<fieldType name="pdouble" class="solr.DoubleField"/>
<fieldType name="pdate" class="solr.DateField" sortMissingLast="true"/>

<fieldType name="sint" class="solr.SortableIntField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="slong" class="solr.SortableLongField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="sfloat" class="solr.SortableFloatField" sortMissingLast="true" omitNorms="true"/>
<fieldType name="sdouble" class="solr.SortableDoubleField" sortMissingLast="true" omitNorms="true"/>

<fieldType name="random" class="solr.RandomSortField" indexed="true" />

<fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
  </analyzer>
</fieldType>

<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
</fieldType>

<fieldType name="text_en" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_en.txt"
            enablePositionIncrements="true"
            />
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPossessiveFilterFactory"/>
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_en.txt"
            enablePositionIncrements="true"
            />
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EnglishPossessiveFilterFactory"/>
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>
</fieldType>

<fieldType name="text_en_splitting" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
  <analyzer type="index">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_en.txt"
            enablePositionIncrements="true"
            />
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_en.txt"
            enablePositionIncrements="true"
            />
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.PorterStemFilterFactory"/>
  </analyzer>
</fieldType>

<fieldType name="text_en_splitting_tight" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true">
  <analyzer>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_en.txt"/>
    <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/>
    <filter class="solr.EnglishMinimalStemFilterFactory"/>
    <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
  </analyzer>
</fieldType>

<fieldType name="text_general_rev" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true"
       maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
    <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" />
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
</fieldType>

<fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true">
  <analyzer>
    <tokenizer class="solr.KeywordTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory" />
    <filter class="solr.TrimFilterFactory" />
    <filter class="solr.PatternReplaceFilterFactory"
            pattern="([^a-z])" replacement="" replace="all"
    />
  </analyzer>
</fieldType>

<fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" >
  <analyzer>
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
  </analyzer>
</fieldtype>

<fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField" >
  <analyzer>
    <tokenizer class="solr.WhitespaceTokenizerFactory"/>
    <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/>
  </analyzer>
</fieldtype>

<fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.KeywordTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory" />
  </analyzer>
</fieldType>

<fieldType name="text_path" class="solr.TextField" positionIncrementGap="100">
  <analyzer>
    <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
  </analyzer>
</fieldType>

<fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" />

<fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/>

<fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/>

<fieldtype name="geohash" class="solr.GeoHashField"/>

<fieldType name="currency" class="solr.CurrencyField" precisionStep="8" defaultCurrency="USD" currencyConfig="currency.xml" />

                                        

<field name="weight" type="float" indexed="true" stored="true"/>
<field name="price"  type="float" indexed="true" stored="true"/>
<field name="popularity" type="int" indexed="true" stored="true" />
<field name="inStock" type="boolean" indexed="true" stored="true" />

<field name="store" type="location" indexed="true" stored="true"/>
<field name="title" type="text_general" indexed="true" stored="true" multiValued="true"/>
<field name="subject" type="text_general" indexed="true" stored="true"/>
<field name="description" type="text_general" indexed="true" stored="true"/>
<field name="comments" type="text_general" indexed="true" stored="true"/>
<field name="author" type="text_general" indexed="true" stored="true"/>
<field name="keywords" type="text_general" indexed="true" stored="true"/>
<field name="category" type="text_general" indexed="true" stored="true"/>
<field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/>
<field name="last_modified" type="date" indexed="true" stored="true"/>
<field name="links" type="string" indexed="true" stored="true" multiValued="true"/>

<field name="content" type="text_general" indexed="false" stored="true" multiValued="false"/>
<field name="text" type="text_general" indexed="true" stored="true" multiValued="true"/>

<field name="text_rev" type="text_general_rev" indexed="true" stored="false" multiValued="true"/>

<field name="manu_exact" type="string" indexed="true" stored="false"/>

<field name="payloads" type="payloads" indexed="true" stored="true"/>

<dynamicField name="*_i"  type="int"    indexed="true"  stored="true"/>
<dynamicField name="*_s"  type="string"  indexed="true"  stored="true"/>
<dynamicField name="*_l"  type="long"   indexed="true"  stored="true"/>
<dynamicField name="*_t"  type="text_general"    indexed="true"  stored="true"/>
<dynamicField name="*_txt" type="text_general"    indexed="true"  stored="true" multiValued="true"/>
<dynamicField name="*_en"  type="text_en"    indexed="true"  stored="true" multiValued="true" />
<dynamicField name="*_b"  type="boolean" indexed="true"  stored="true"/>
<dynamicField name="*_f"  type="float"  indexed="true"  stored="true"/>
<dynamicField name="*_d"  type="double" indexed="true"  stored="true"/>

<dynamicField name="*_coordinate"  type="tdouble" indexed="true"  stored="false"/>

<dynamicField name="*_dt" type="date"    indexed="true"  stored="true"/>
<dynamicField name="*_p"  type="location" indexed="true" stored="true"/>

<dynamicField name="*_ti" type="tint"    indexed="true"  stored="true"/>
<dynamicField name="*_tl" type="tlong"   indexed="true"  stored="true"/>
<dynamicField name="*_tf" type="tfloat"  indexed="true"  stored="true"/>
<dynamicField name="*_td" type="tdouble" indexed="true"  stored="true"/>
<dynamicField name="*_tdt" type="tdate"  indexed="true"  stored="true"/>

<dynamicField name="*_pi"  type="pint"    indexed="true"  stored="true"/>
<dynamicField name="*_c"   type="currency" indexed="true"  stored="true"/>

<dynamicField name="ignored_*" type="ignored" multiValued="true"/>
<dynamicField name="attr_*" type="text_general" indexed="true" stored="true" multiValued="true"/>

<dynamicField name="random_*" type="random" />

<uniqueKey>id</uniqueKey>

<copyField source="cat" dest="text"/>
<copyField source="name" dest="text"/>
<copyField source="manu" dest="text"/>
<copyField source="features" dest="text"/>
<copyField source="includes" dest="text"/>
<copyField source="manu" dest="manu_exact"/>

<copyField source="price" dest="price_c"/>

任何帮助都会很棒!

0 个答案:

没有答案